通過top指令檢視監控,發現有7000+的僵屍程序
那麼先把僵屍程序找出來
[root@izbp152ke14timzud0du15z ~]# ps -ef |grep defunct |more
root 303 566 0 01:25 ? 00:00:00 [python] <defunct>
root 310 566 0 01:44 ? 00:00:00 [python] <defunct>
root 313 566 0 13:02 ? 00:00:00 [python] <defunct>
root 316 566 0 09:27 ? 00:00:00 [python] <defunct>
root 319 566 0 05:08 ? 00:00:00 [python] <defunct>
root 329 566 0 02:22 ? 00:00:00 [python] <defunct>
root 331 566 0 04:13 ? 00:00:00 [python] <defunct>
root 332 566 0 05:26 ? 00:00:00 [python] <defunct>
root 334 566 0 03:55 ? 00:00:00 [python] <defunct>
root 353 566 0 Nov01 ? 00:00:00 [python] <defunct>
root 354 566 0 07:33 ? 00:00:00 [python] <defunct>
root 356 566 0 09:07 ? 00:00:00 [python] <defunct>
root 363 566 0 Nov01 ? 00:00:00 [python] <defunct>
root 366 566 0 11:23 ? 00:00:00 [python] <defunct>
root 372 566 0 06:38 ? 00:00:00 [python] <defunct>
root 377 566 0 11:43 ? 00:00:00 [python] <defunct>
root 378 566 0 14:39 ? 00:00:00 [python] <defunct>
root 379 566 0 11:03 ? 00:00:00 [python] <defunct>
root 390 566 0 01:06 ? 00:00:00 [python] <defunct>
root 391 566 0 14:01 ? 00:00:00 [python] <defunct>
root 395 566 0 Nov01 ? 00:00:00 [python] <defunct>
結果中第3列(PPID)的 566 就是僵屍程序的父程序,那麼看看這個是什麼程序
# top -p 566
top - 15:18:23 up 21:21, 1 user, load average: 0.56, 0.43, 0.56
Tasks: 1 total, 0 running, 1 sleeping, 0 stopped, 0 zombie
%Cpu(s): 4.5 us, 5.0 sy, 0.0 ni, 90.2 id, 0.1 wa, 0.0 hi, 0.2 si, 0.0 st
KiB Mem : 7732980 total, 800156 free, 3259668 used, 3673156 buff/cache
KiB Swap: 0 total, 0 free, 0 used. 4162612 avail Mem
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
566 root 20 0 1254132 538104 14288 S 0.7 7.0 35:53.07 datakit
發現是一個名為 datakit 的程序,那麼接下來就該去檢視代碼排查問題了
臨時的處理辦法是先把父程序殺掉:父程序結束後,僵屍程序會被 init(PID 1)接管,其資源自然會被回收
# kill -9 566
但是問題就這樣結束了嗎,當然沒有,問題原因總歸要找到
重新啟動程序後,自然會重新生成僵屍程序,通過檢視程序樹
最上面的父程序還是為datakit,datakit下面生成了很多的Python僵屍程序
6206 ? Ssl 0:50 /usr/local/datakit/datakit
6550 ? Sl 0:00 \_ /usr/local/datakit/externals/oracle --interval 1m --host <your-oracle-host> --port 1521 --userna
6721 ? Z 0:00 \_ [python] <defunct>
6951 ? Z 0:00 \_ [python] <defunct>
7179 ? Z 0:00 \_ [python] <defunct>
7356 ? Z 0:00 \_ [python] <defunct>
7550 ? Z 0:00 \_ [python] <defunct>
7729 ? Z 0:00 \_ [python] <defunct>
7920 ? Z 0:00 \_ [python] <defunct>
8100 ? Z 0:00 \_ [python] <defunct>
8277 ? Z 0:00 \_ [python] <defunct>
8466 ? Z 0:00 \_ [python] <defunct>
8666 ? Z 0:00 \_ [python] <defunct>
既然知道程式datakit會産生很多Python的僵屍程序,那麼就在代碼中,搜尋關于執行Python程序的代碼,而且僵屍程序這麼多,可能是不止一處調用,或者是存在循環調用
果然經過一番搜尋
// buildExternals builds every entry in externals for the goos/goarch target
// and places the results under <outdir>/externals.
//
// An entry is skipped (with a warning) unless its osarchs set contains both
// the platform this build is running on and the requested target platform.
// Entries whose lang is "go"/"golang" are compiled with "go build", "makefile"
// entries are built with "make", and every other lang (e.g. python) runs the
// entry's own buildCmd with the output directory appended to its buildArgs.
// A failing build command is reported through l.Fatalf.
func buildExternals(outdir, goos, goarch string) {
	curOSArch := runtime.GOOS + "/" + runtime.GOARCH
	osarch := goos + "/" + goarch // target platform; invariant across entries

	for _, ex := range externals {
		l.Debugf("building %s-%s/%s", goos, goarch, ex.name)

		if _, ok := ex.osarchs[curOSArch]; !ok {
			l.Warnf("skip build %s under %s", ex.name, curOSArch)
			continue
		}

		if _, ok := ex.osarchs[osarch]; !ok {
			l.Warnf("skip build %s under %s", ex.name, osarch)
			continue
		}

		out := ex.name

		// lang is lower-cased here, so the case labels below only need the
		// lower-case spellings.
		switch strings.ToLower(ex.lang) {
		case "go", "golang":
			switch osarch {
			case "windows/amd64", "windows/386":
				out += ".exe"
			default: // pass
			}

			args := []string{
				"go", "build",
				"-o", filepath.Join(outdir, "externals", out),
				"-ldflags",
				"-w -s",
				filepath.Join("plugins", "externals", ex.name, ex.entry),
			}

			// Build the environment in a fresh slice: appending to ex.envs
			// directly would leak this target's GOOS/GOARCH into the shared
			// entry and pile up across calls for other targets.
			envs := make([]string, 0, len(ex.envs)+2)
			envs = append(envs, ex.envs...)
			envs = append(envs, "GOOS="+goos, "GOARCH="+goarch)

			msg, err := runEnv(args, envs)
			if err != nil {
				l.Fatalf("failed to run %v, envs: %v: %v, msg: %s",
					args, envs, err, string(msg))
			}

		case "makefile":
			args := []string{
				"make",
				"--file=" + filepath.Join("plugins", "externals", ex.name, ex.entry),
				"OUTPATH=" + filepath.Join(outdir, "externals", out),
				"BASEPATH=" + "plugins/externals/" + ex.name,
			}

			// Same reasoning as the Go branch: never grow ex.envs in place.
			envs := make([]string, 0, len(ex.envs)+2)
			envs = append(envs, ex.envs...)
			envs = append(envs, "GOOS="+goos, "GOARCH="+goarch)

			msg, err := runEnv(args, envs)
			if err != nil {
				l.Fatalf("failed to run %v, envs: %v: %v, msg: %s",
					args, envs, err, string(msg))
			}

		default: // for python, just copy source code into build dir
			// Copy buildArgs before adding the output directory so repeated
			// calls do not accumulate directories on the shared entry.
			buildArgs := make([]string, 0, len(ex.buildArgs)+1)
			buildArgs = append(buildArgs, ex.buildArgs...)
			buildArgs = append(buildArgs, filepath.Join(outdir, "externals"))

			cmd := exec.Command(ex.buildCmd, buildArgs...) //nolint:gosec
			if ex.envs != nil {
				cmd.Env = append(os.Environ(), ex.envs...)
			}

			// CombinedOutput runs the command to completion and collects
			// stdout+stderr, so no separate cmd.Wait() call is needed (or
			// allowed) afterwards.
			res, err := cmd.CombinedOutput()
			if err != nil {
				l.Fatalf("failed to build python(%s %s): %s, err: %s",
					ex.buildCmd, strings.Join(buildArgs, " "), res, err.Error())
			}
		}
	}
}
該函數中,for 循環語句塊通過 switch 分支中的 default 執行了 exec.Command。如果父程序建立子程序後沒有等待(wait)子程序結束並回收其退出狀態,子程序退出後就會變成僵屍程序
default: // for python, just copy source code into build dir
ex.buildArgs = append(ex.buildArgs, filepath.Join(outdir, "externals"))
cmd := exec.Command(ex.buildCmd, ex.buildArgs...) //nolint:gosec
if ex.envs != nil {
cmd.Env = append(os.Environ(), ex.envs...)
}
res, err := cmd.CombinedOutput()
if err != nil {
l.Fatalf("failed to build python(%s %s): %s, err: %s",
ex.buildCmd, strings.Join(ex.buildArgs, " "), res, err.Error())
}
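如果子程序是通過 cmd.Start() 啟動的,應該加上以下代碼等待並回收子程序,防止僵屍程序(注意:cmd.Run()、cmd.Output()、cmd.CombinedOutput() 內部已經會調用 Wait,此時不能再重複調用 cmd.Wait(),否則只會返回錯誤)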
if err:=cmd.Wait();err!=nil{
fmt.Println(err)
}