天天看點

系統中出現僵屍程序排查過程

通過top指令檢視監控,發現有7000+的僵屍程序

系統中出現僵屍程序排查過程
那麼先把僵屍程序找出來

[root@izbp152ke14timzud0du15z ~]# ps -ef |grep defunct |more
root       303   566  0 01:25 ?        00:00:00 [python] <defunct>
root       310   566  0 01:44 ?        00:00:00 [python] <defunct>
root       313   566  0 13:02 ?        00:00:00 [python] <defunct>
root       316   566  0 09:27 ?        00:00:00 [python] <defunct>
root       319   566  0 05:08 ?        00:00:00 [python] <defunct>
root       329   566  0 02:22 ?        00:00:00 [python] <defunct>
root       331   566  0 04:13 ?        00:00:00 [python] <defunct>
root       332   566  0 05:26 ?        00:00:00 [python] <defunct>
root       334   566  0 03:55 ?        00:00:00 [python] <defunct>
root       353   566  0 Nov01 ?        00:00:00 [python] <defunct>
root       354   566  0 07:33 ?        00:00:00 [python] <defunct>
root       356   566  0 09:07 ?        00:00:00 [python] <defunct>
root       363   566  0 Nov01 ?        00:00:00 [python] <defunct>
root       366   566  0 11:23 ?        00:00:00 [python] <defunct>
root       372   566  0 06:38 ?        00:00:00 [python] <defunct>
root       377   566  0 11:43 ?        00:00:00 [python] <defunct>
root       378   566  0 14:39 ?        00:00:00 [python] <defunct>
root       379   566  0 11:03 ?        00:00:00 [python] <defunct>
root       390   566  0 01:06 ?        00:00:00 [python] <defunct>
root       391   566  0 14:01 ?        00:00:00 [python] <defunct>
root       395   566  0 Nov01 ?        00:00:00 [python] <defunct>      

結果中第3列(PPID)的 566 就是這些僵屍程序的父程序 PID,那麼看看這個是什麼程序

# top -p 566
top - 15:18:23 up 21:21,  1 user,  load average: 0.56, 0.43, 0.56
Tasks:   1 total,   0 running,   1 sleeping,   0 stopped,   0 zombie
%Cpu(s):  4.5 us,  5.0 sy,  0.0 ni, 90.2 id,  0.1 wa,  0.0 hi,  0.2 si,  0.0 st
KiB Mem :  7732980 total,   800156 free,  3259668 used,  3673156 buff/cache
KiB Swap:        0 total,        0 free,        0 used.  4162612 avail Mem
  PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+ COMMAND
  566 root      20   0 1254132 538104  14288 S   0.7  7.0  35:53.07 datakit      

發現是一個名為 datakit 的程序,那麼接下來就該去檢視代碼排查問題了

先把父程序殺掉:父程序退出後,它留下的僵屍程序會被 init 程序(PID 1)接管並回收,資源自然就被釋放了

# kill -9 566      

但是問題就這樣結束了嗎,當然沒有,問題原因總歸要找到

重新啟動程序後,自然會重新生成僵屍程序。通過檢視程序樹可以看到,最上面的父程序還是datakit,datakit下面生成了很多的Python僵屍程序

6206 ?        Ssl    0:50 /usr/local/datakit/datakit
 6550 ?        Sl     0:00  \_ /usr/local/datakit/externals/oracle --interval 1m --host <your-oracle-host> --port 1521 --userna
 6721 ?        Z      0:00  \_ [python] <defunct>
 6951 ?        Z      0:00  \_ [python] <defunct>
 7179 ?        Z      0:00  \_ [python] <defunct>
 7356 ?        Z      0:00  \_ [python] <defunct>
 7550 ?        Z      0:00  \_ [python] <defunct>
 7729 ?        Z      0:00  \_ [python] <defunct>
 7920 ?        Z      0:00  \_ [python] <defunct>
 8100 ?        Z      0:00  \_ [python] <defunct>
 8277 ?        Z      0:00  \_ [python] <defunct>
 8466 ?        Z      0:00  \_ [python] <defunct>
 8666 ?        Z      0:00  \_ [python] <defunct>      

既然知道程式datakit會産生很多Python的僵屍程序,那麼就在代碼中,搜尋關于執行Python程序的代碼,而且僵屍程序這麼多,可能是不止一處調用,或者是存在循環調用

果然經過一番搜尋

func buildExternals(outdir, goos, goarch string) {
    curOSArch := runtime.GOOS + "/" + runtime.GOARCH

    for _, ex := range externals {
        l.Debugf("building %s-%s/%s", goos, goarch, ex.name)

        if _, ok := ex.osarchs[curOSArch]; !ok {
            l.Warnf("skip build %s under %s", ex.name, curOSArch)
            continue
        }

        osarch := goos + "/" + goarch
        if _, ok := ex.osarchs[osarch]; !ok {
            l.Warnf("skip build %s under %s", ex.name, osarch)
            continue
        }

        out := ex.name

        switch strings.ToLower(ex.lang) {
        case "go", "golang":

            switch osarch {
            case "windows/amd64", "windows/386":
                out += ".exe"
            default: // pass
            }

            args := []string{
                "go", "build",
                "-o", filepath.Join(outdir, "externals", out),
                "-ldflags",
                "-w -s",
                filepath.Join("plugins", "externals", ex.name, ex.entry),
            }

            ex.envs = append(ex.envs, "GOOS="+goos, "GOARCH="+goarch)

            msg, err := runEnv(args, ex.envs)
            if err != nil {
                l.Fatalf("failed to run %v, envs: %v: %v, msg: %s",
                    args, ex.envs, err, string(msg))
            }
        case "makefile", "Makefile":
            args := []string{
                "make",
                "--file=" + filepath.Join("plugins", "externals", ex.name, ex.entry),
                "OUTPATH=" + filepath.Join(outdir, "externals", out),
                "BASEPATH=" + "plugins/externals/" + ex.name,
            }

            ex.envs = append(ex.envs, "GOOS="+goos, "GOARCH="+goarch)
            msg, err := runEnv(args, ex.envs)
            if err != nil {
                l.Fatalf("failed to run %v, envs: %v: %v, msg: %s",
                    args, ex.envs, err, string(msg))
            }
        default: // for python, just copy source code into build dir
            ex.buildArgs = append(ex.buildArgs, filepath.Join(outdir, "externals"))
            cmd := exec.Command(ex.buildCmd, ex.buildArgs...) //nolint:gosec
            if ex.envs != nil {
                cmd.Env = append(os.Environ(), ex.envs...)
            }

            res, err := cmd.CombinedOutput()
            if err != nil {
                l.Fatalf("failed to build python(%s %s): %s, err: %s",
                    ex.buildCmd, strings.Join(ex.buildArgs, " "), res, err.Error())
            }
        }
    }
}
      

該函數中,for 循環語句塊通過 switch 分支中的 default 執行了 exec.Command。如果父程序建立子程序後沒有呼叫 wait 回收其結束狀態,子程序退出後就會變成僵屍程序

default: // for python, just copy source code into build dir
            ex.buildArgs = append(ex.buildArgs, filepath.Join(outdir, "externals"))
            cmd := exec.Command(ex.buildCmd, ex.buildArgs...) //nolint:gosec
            if ex.envs != nil {
                cmd.Env = append(os.Environ(), ex.envs...)
            }

            res, err := cmd.CombinedOutput()
            if err != nil {
                l.Fatalf("failed to build python(%s %s): %s, err: %s",
                    ex.buildCmd, strings.Join(ex.buildArgs, " "), res, err.Error())
            }      

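應該加上以下代碼,防止僵屍程序(注意:cmd.Wait() 只適用於以 cmd.Start() 啟動的子程序;cmd.Run()、cmd.Output() 與 cmd.CombinedOutput() 內部已經會等待子程序結束,之後再呼叫 Wait 只會返回錯誤,因此要補在真正使用 Start 卻沒有 Wait 的調用處)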

if err:=cmd.Wait();err!=nil{
       fmt.Println(err)
   }