Unlike the more common kernel panics and soft lockups, an ordinary kernel deadlock may have no fatal impact on the operating system at all. Take the case we are about to analyze: an ops colleague found that running top on an ECS instance and pressing c caused it to hang, unresponsive to any further input.
Some observation showed that pressing c in top toggles the display of each process's full startup command line. Since only the top process hung, a new shell could still be opened, and from there top could be seen sitting in the UN (uninterruptible sleep) state; its kernel stack showed it had been scheduled out in rwsem_down_read_failed. rwsem_down_read_failed is the function called when an attempt to take a rw_semaphore for reading fails, so the key questions are: which semaphore is this, and who took it? Without further ado, let's open the core dump.
Deriving the semaphore's address
The core dump captured quite a few top processes in UN state. Picking one at random, it is blocked reading /proc/4424/cmdline from the proc filesystem:
crash> bt
PID: 28968 TASK: ffff88041a820fb0 CPU: 3 COMMAND: "top"
#0 [ffff880387b8bd28] __schedule at ffffffff8168c1a5
#1 [ffff880387b8bd90] schedule at ffffffff8168c7f9
#2 [ffff880387b8bda0] rwsem_down_read_failed at ffffffff8168e1a5
#3 [ffff880387b8be08] call_rwsem_down_read_failed at ffffffff81327618
#4 [ffff880387b8be58] down_read at ffffffff8168b980
#5 [ffff880387b8be70] proc_pid_cmdline_read at ffffffff8126f712
#6 [ffff880387b8bf00] vfs_read at ffffffff811fe86e
#7 [ffff880387b8bf38] sys_read at ffffffff811ff43f
#8 [ffff880387b8bf80] system_call_fastpath at ffffffff81697809
RIP: 00007f83249077e0 RSP: 00007fff1f5c99e8 RFLAGS: 00000246
RAX: 0000000000000000 RBX: ffffffff81697809 RCX: ffffffffffffffff
RDX: 0000000000020000 RSI: 0000000000c07700 RDI: 0000000000000009
RBP: 0000000000020000 R8: 00007f8324866988 R9: 0000000000000012
R10: 0000000000000007 R11: 0000000000000246 R12: 0000000000000000
R13: 0000000000c07700 R14: 0000000000000000 R15: 0000000000c07700
ORIG_RAX: 0000000000000000 CS: 0033 SS: 002b
crash> files
PID: 28968 TASK: ffff88041a820fb0 CPU: 3 COMMAND: "top"
ROOT: / CWD: /root
FD FILE DENTRY INODE TYPE PATH
0 ffff8804c0f47900 ffff88017f80ad80 ffff8807e05a7028 CHR /dev/tty1
1 ffff8804c0f47900 ffff88017f80ad80 ffff8807e05a7028 CHR /dev/tty1
2 ffff8804bfadbc00 ffff88017f80a240 ffff8807e05a4850 CHR /dev/null
3 ffff8804c0f47900 ffff88017f80ad80 ffff8807e05a7028 CHR /dev/tty1
4 ffff8804bfadb400 ffff880449bc18c0 ffff8802e1bad750 REG /proc/stat
5 ffff8804bfadb000 ffff8807dc5bf980 ffff88048fbfdf00 REG /proc/uptime
6 ffff8803d3217200 ffff8807dc5befc0 ffff88048fbfd750 REG /proc/meminfo
7 ffff8800686c5200 ffff8802e290c240 ffff8802e290ae60 REG /proc/loadavg
8 ffff8800686c5000 ffff88017f808240 ffff88017f80c040 DIR /proc/
9 ffff8804bf16c400 ffff8806afc70900 ffff8805366f1f00 REG /proc/4424/cmdline
The registers confirm the picture: RDI, the fd argument to sys_read, is 9, which the files output maps to /proc/4424/cmdline. So proc_pid_cmdline_read blocked while taking the lock; the call in question is the down_read at line 237 (the saved return address ffffffff8126f712 corresponds to line 238, the first instruction after the call):
203 static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
204 size_t _count, loff_t *pos)
205 {
206 struct task_struct *tsk;
207 struct mm_struct *mm;
208 char *page;
209 unsigned long count = _count;
210 unsigned long arg_start, arg_end, env_start, env_end;
211 unsigned long len1, len2, len;
212 unsigned long p;
213 char c;
214 ssize_t rv;
215
216 BUG_ON(*pos < 0);
217
218 tsk = get_proc_task(file_inode(file));
219 if (!tsk)
220 return -ESRCH;
221 mm = get_task_mm(tsk);
222 put_task_struct(tsk);
223 if (!mm)
224 return 0;
225 /* Check if process spawned far enough to have cmdline. */
226 if (!mm->env_end) {
227 rv = 0;
228 goto out_mmput;
229 }
230
231 page = (char *)__get_free_page(GFP_TEMPORARY);
232 if (!page) {
233 rv = -ENOMEM;
234 goto out_mmput;
235 }
236
237 down_read(&mm->mmap_sem);
238 arg_start = mm->arg_start;
239 arg_end = mm->arg_end;
240 env_start = mm->env_start;
241 env_end = mm->env_end;
242 up_read(&mm->mmap_sem);
......
There are several ways to locate &mm->mmap_sem from here. Here we derive it from the disassembly and the data left on the stack. Around the call site we can see that, just before calling down_read, proc_pid_cmdline_read stashes the address of mmap_sem at [rbp-0x60]:
0xffffffff8126f6eb <proc_pid_cmdline_read+139>: mov edi,0x800d0
0xffffffff8126f6f0 <proc_pid_cmdline_read+144>: call 0xffffffff81185f70 <__get_free_pages>
0xffffffff8126f6f5 <proc_pid_cmdline_read+149>: test rax,rax
0xffffffff8126f6f8 <proc_pid_cmdline_read+152>: mov QWORD PTR [rbp-0x40],rax
0xffffffff8126f6fc <proc_pid_cmdline_read+156>: je 0xffffffff8126f9f0 <proc_pid_cmdline_read+912>
0xffffffff8126f702 <proc_pid_cmdline_read+162>: lea rax,[rbx+0x78]
0xffffffff8126f706 <proc_pid_cmdline_read+166>: mov rdi,rax
0xffffffff8126f709 <proc_pid_cmdline_read+169>: mov QWORD PTR [rbp-0x60],rax
0xffffffff8126f70d <proc_pid_cmdline_read+173>: call 0xffffffff8168b960 <down_read>
0xffffffff8126f712 <proc_pid_cmdline_read+178>: mov rax,QWORD PTR [rbx+0x128]
Since proc_pid_cmdline_read's stack frame does not change across the subsequent calls, subtracting 0x60 from the function's frame base points at the slot holding the semaphore's address. The frame's saved rbp sits at ffff880387b8bef8, and ffff880387b8bef8 - 0x60 = ffff880387b8be98, which in the dump below holds the mmap_sem address, ffff8801f7b151b8:
#5 [ffff880387b8be70] proc_pid_cmdline_read at ffffffff8126f712
ffff880387b8be78: ffff8804bf16c400 0000000000020000
ffff880387b8be88: ffff8805366f1f00 ffff8804bf16c410
ffff880387b8be98: ffff8801f7b151b8 ffff880387b8bed0
ffff880387b8bea8: ffffffff812a9504 0000000000020000
ffff880387b8beb8: ffff8804897de000 0000000000000000
ffff880387b8bec8: 00000000f38e5979 ffff8804bf16c400
ffff880387b8bed8: 0000000000c07700 ffff880387b8bf48
ffff880387b8bee8: 0000000000020000 0000000000000009
ffff880387b8bef8: ffff880387b8bf30 ffffffff811fe86e
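As a cross-check, the lea rax,[rbx+0x78] in the disassembly computes &mm->mmap_sem with the mm pointer in rbx, so on this build mmap_sem evidently sits at offset 0x78 inside mm_struct (the exact offset is config-dependent and can be confirmed with crash's struct mm_struct -o). Working backwards, the mm_struct itself lives at ffff8801f7b151b8 - 0x78 = ffff8801f7b15140. A minimal sketch of the same address computation, using a hypothetical stand-in layout:

#include <stdio.h>

/* Hypothetical stand-in for this build's mm_struct layout; all that
 * matters for the sketch is that mmap_sem sits at offset 0x78. */
struct mm_struct_sketch {
    unsigned char other_fields[0x78];   /* members preceding mmap_sem */
    struct { long count; } mmap_sem;    /* stand-in for rw_semaphore  */
};

int main(void)
{
    /* the mm_struct recovered from the dump: ffff8801f7b151b8 - 0x78 */
    struct mm_struct_sketch *mm =
        (struct mm_struct_sketch *)0xffff8801f7b15140UL;

    /* "lea rax,[rbx+0x78]" computes exactly this, with mm in rbx;
     * prints 0xffff8801f7b151b8 */
    printf("&mm->mmap_sem = %p\n", (void *)&mm->mmap_sem);
    return 0;
}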
The semaphore's wait queue
The previous section located the semaphore top is waiting on: mmap_sem, at address ffff8801f7b151b8. It is a variable of type rw_semaphore, which the kernel typically uses in read-mostly, write-rarely scenarios.
crash> rw_semaphore ffff8801f7b151b8
struct rw_semaphore {
count = -4294967295,
wait_lock = {
raw_lock = {
{
head_tail = 195300260,
tickets = {
head = 2980,
tail = 2980
}
}
}
},
wait_list = {
next = 0xffff8807d9b03dd0,
prev = 0xffff8804d1f6bdb0
}
}
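Before walking the wait list, the count field is already telling. On 3.10-era x86_64 kernels the rwsem xadd implementation uses the bias constants below (from arch/x86/include/asm/rwsem.h); treat the exact values as an assumption of this kernel generation. A sketch of decoding the captured value -4294967295 (0xffffffff00000001):

#include <stdio.h>

/* rwsem bias constants on 3.10-era x86_64 (arch/x86/include/asm/rwsem.h) */
#define RWSEM_ACTIVE_BIAS    0x00000001L
#define RWSEM_ACTIVE_MASK    0xffffffffL
#define RWSEM_WAITING_BIAS   (-RWSEM_ACTIVE_MASK - 1)   /* -0x100000000 */

int main(void)
{
    long count   = -4294967295L;                 /* value from the dump  */
    long active  = count & RWSEM_ACTIVE_MASK;    /* low 32 bits: holders */
    long waiting = (count - active) / RWSEM_WAITING_BIAS;

    /* prints "active=1 waiting=1": one task holds the semaphore and the
     * wait list is non-empty. A writer holding with waiters queued would
     * read -8589934591 (ACTIVE_WRITE_BIAS + WAITING_BIAS), so the lone
     * holder here must have taken the lock with down_read. */
    printf("active=%ld waiting=%ld\n", active, waiting);
    return 0;
}

So even before finding the culprit, count already says: exactly one task holds mmap_sem, it holds it for read, and everyone else is queued behind it.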
Looking at the rw_semaphore implementation, there is a companion structure, rwsem_waiter, whose task member records a process waiting on the semaphore, and rw_semaphore.wait_list is linked through rwsem_waiter.list. Walking rw_semaphore.wait_list as a list of rwsem_waiter therefore yields the queue of waiting processes.
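For reference, the waiter bookkeeping that the list walk below relies on is defined in 3.10-era lib/rwsem.c roughly as follows (quoted from that generation of the source, so treat the exact definition as an assumption of this build):

enum rwsem_waiter_type {
    RWSEM_WAITING_FOR_WRITE,
    RWSEM_WAITING_FOR_READ
};

struct rwsem_waiter {
    struct list_head list;      /* linked into rw_semaphore.wait_list */
    struct task_struct *task;   /* the sleeping process               */
    enum rwsem_waiter_type type;
};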
crash> list rwsem_waiter.list -s rwsem_waiter.task,type -h 0xffff8807d9b03dd0
ffff8807d9b03dd0
task = 0xffff880426cbaf10
type = RWSEM_WAITING_FOR_WRITE
ffff8802d3c17db0
task = 0xffff8802b3bd4e70
type = RWSEM_WAITING_FOR_READ
ffff8807de05fdb0
task = 0xffff8802a1e03ec0
type = RWSEM_WAITING_FOR_READ
ffff88018dbe3db0
task = 0xffff88018da3de20
type = RWSEM_WAITING_FOR_READ
ffff88011032bdb0
task = 0xffff8807c1fd3ec0
type = RWSEM_WAITING_FOR_READ
ffff8804fd3fbdb0
task = 0xffff8800ba3f4e70
type = RWSEM_WAITING_FOR_READ
ffff8807ffd87db0
task = 0xffff880012dd8fb0
type = RWSEM_WAITING_FOR_READ
ffff8801bc5ebdb0
task = 0xffff88046094de20
type = RWSEM_WAITING_FOR_READ
ffff8805c11b7db0
task = 0xffff8807bcf8edd0
type = RWSEM_WAITING_FOR_READ
......
Who took the semaphore?
The wait queue for mmap_sem recovered in the previous section is very long: over a thousand entries, meaning more than a thousand processes are stuck in UN state waiting for mmap_sem. How do we find out who actually took the semaphore? Turn the problem around: whether a process is still waiting for mmap_sem or already took it, it must have gone through down_read/down_write just like top did, passing the address through registers and spilling it onto the stack, so its kernel stack frames should still contain the address of mmap_sem. Comparing the set of processes whose stacks contain the mmap_sem address against the processes in the wait queue exposes the one squatting on the lock. Comparing counts first: the processes with the address on their stacks outnumber the wait-queue entries by exactly one.
crash> search -t ffff8801f7b151b8 | grep TASK | wc -l
1470
crash> list rwsem_waiter.list -s rwsem_waiter.task -h 0xffff8807d9b03dd0 | grep task | wc -l
1469
......
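The remaining step is mechanical: take the difference of the two sets of task addresses; the one stack-referencing task that is not in the wait queue is the holder. A minimal offline sketch of that comparison, where the two arrays are hypothetical excerpts standing in for the full lists dumped from crash:

#include <stdio.h>
#include <string.h>

/* Excerpts of the two task lists: tasks whose kernel stacks contain
 * ffff8801f7b151b8, and tasks actually linked into the wait queue. */
static const char *on_stacks[] = {
    "ffff880426cbbec0", "ffff880426cbaf10", "ffff8802b3bd4e70",
};
static const char *in_queue[] = {
    "ffff880426cbaf10", "ffff8802b3bd4e70",
};

int main(void)
{
    /* report every stack-referencing task missing from the wait queue */
    for (size_t i = 0; i < sizeof(on_stacks) / sizeof(on_stacks[0]); i++) {
        int queued = 0;
        for (size_t j = 0; j < sizeof(in_queue) / sizeof(in_queue[0]); j++)
            if (strcmp(on_stacks[i], in_queue[j]) == 0)
                queued = 1;
        if (!queued)
            printf("holds the semaphore: task %s\n", on_stacks[i]);
    }
    return 0;
}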
Following that thread, the extra process is easily identified: PID 4442.
crash> bt 4442
PID: 4442 TASK: ffff880426cbbec0 CPU: 2 COMMAND: "filebeat"
#0 [ffff8807a6643690] __schedule at ffffffff8168c1a5
#1 [ffff8807a66436f8] schedule at ffffffff8168c7f9
#2 [ffff8807a6643708] schedule_timeout at ffffffff8168a239
#3 [ffff8807a66437b0] io_schedule_timeout at ffffffff8168bd9e
#4 [ffff8807a66437e0] io_schedule at ffffffff8168be38
#5 [ffff8807a66437f0] bt_get at ffffffff812fb915
#6 [ffff8807a6643860] blk_mq_get_tag at ffffffff812fbe7f
#7 [ffff8807a6643888] __blk_mq_alloc_request at ffffffff812f725b
#8 [ffff8807a66438b8] blk_mq_map_request at ffffffff812f96d1
#9 [ffff8807a6643928] blk_sq_make_request at ffffffff812fa430
#10 [ffff8807a66439b0] generic_make_request at ffffffff812eee69
#11 [ffff8807a66439f8] submit_bio at ffffffff812eefb1
#12 [ffff8807a6643a50] do_mpage_readpage at ffffffff8123ffed
#13 [ffff8807a6643b28] mpage_readpages at ffffffff8124058b
#14 [ffff8807a6643bf8] ext4_readpages at ffffffffa01df23c [ext4]
#15 [ffff8807a6643c08] __do_page_cache_readahead at ffffffff8118dd2c
#16 [ffff8807a6643cc8] ra_submit at ffffffff8118e3c1
#17 [ffff8807a6643cd8] filemap_fault at ffffffff811836f5
#18 [ffff8807a6643d38] ext4_filemap_fault at ffffffffa01e8016 [ext4]
#19 [ffff8807a6643d60] __do_fault at ffffffff811ac83c
#20 [ffff8807a6643db0] do_read_fault at ffffffff811accd3
#21 [ffff8807a6643e00] handle_mm_fault at ffffffff811b1461
#22 [ffff8807a6643e98] __do_page_fault at ffffffff81692cc4
#23 [ffff8807a6643ef8] trace_do_page_fault at ffffffff816930a6
#24 [ffff8807a6643f38] do_async_page_fault at ffffffff8169274b
#25 [ffff8807a6643f50] async_page_fault at ffffffff8168f238
RIP: 0000000000adf1f9 RSP: 00007fcefbe06860 RFLAGS: 00010297
RAX: 0000000000000004 RBX: 0000000000000000 RCX: 0000000000ad1100
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
RBP: 00007fcefbe06b28 R8: 000000c420066080 R9: 000000007fffffff
R10: 0000000001a14630 R11: 0000000001e89ee0 R12: 000000c42239cd70
R13: 0000000001a14630 R14: 0000000000aea430 R15: 0000000000000000
ORIG_RAX: ffffffffffffffff CS: 0033 SS: 002b
This process appears to be hung on I/O. Tracing back up its call chain, __do_page_fault is where it acquired the mmap_sem semaphore:
1122 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
1123 if ((error_code & PF_USER) == 0 &&
1124 !search_exception_tables(regs->ip)) {
1125 bad_area_nosemaphore(regs, error_code, address);
1126 return;
1127 }
1128 retry:
1129 down_read(&mm->mmap_sem);
1130 } else {
1131 /*
1132 * The above down_read_trylock() might have succeeded in
1133 * which case we'll have missed the might_sleep() from
1134 * down_read():
1135 */
1136 might_sleep();
1137 }
As for why process 4442 never released mmap_sem: after some digging, it appears to have hit a kernel bug in bt_get and is stuck in that function indefinitely:
https://lore.kernel.org/lkml/[email protected]/#Z30::20block:blk-mq-tag.c. Now look back at the original symptom: top hung while reading /proc/4424/cmdline. 4442 and 4424 belong to the same thread group and share one mm_struct, hence the same mmap_sem. After thread 4442 took mmap_sem with down_read(&mm->mmap_sem) during its page fault, the first process in the wait queue, 0xffff880426cbaf10, attempted down_write; down_write is mutually exclusive with down_read, so it had to queue, and because the rwsem stops admitting new readers once a waiter is queued, every subsequent request to read mmap_sem (the shared mm_struct) piled into the wait queue as well. That is exactly the hang seen after pressing c in top.
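One last detail is worth spelling out: why do later readers block at all when the holder is itself only a reader? On this kernel, down_read's fast path (inline asm in arch/x86/include/asm/rwsem.h) is an atomic increment of count followed by a sign check, so once a queued writer has deposited RWSEM_WAITING_BIAS into count, every new reader sees a negative result and falls into rwsem_down_read_failed, no matter how many readers are already inside. A minimal sketch of that check, where down_read_fastpath is an illustrative stand-in for the real inc/jns sequence:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define RWSEM_ACTIVE_READ_BIAS  1L
#define RWSEM_WAITING_BIAS      (-0xffffffffL - 1)   /* -0x100000000 */

/* Sketch of the down_read fast path: atomically add the read bias and
 * keep the lock only if the result is non-negative; otherwise fall
 * into rwsem_down_read_failed and go to sleep. */
static bool down_read_fastpath(_Atomic long *count)
{
    long newval = atomic_fetch_add(count, RWSEM_ACTIVE_READ_BIAS)
                  + RWSEM_ACTIVE_READ_BIAS;
    return newval >= 0;
}

int main(void)
{
    /* the state captured in the dump: one reader (4442) holding,
     * one writer queued (waiting bias deposited) */
    _Atomic long count = RWSEM_WAITING_BIAS + RWSEM_ACTIVE_READ_BIAS;

    /* top's down_read for /proc/4424/cmdline: prints 0, must queue */
    printf("reader admitted: %d\n", down_read_fastpath(&count));
    return 0;
}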