天天看點

close系統調用分析-性能優化

今天被拉過來加班處理性能問題:

close系統調用分析-性能優化
close系統調用分析-性能優化

優化後對比的結果為:同樣在5w cps(每秒5萬次連線)的情況下,以前的cpu使用率為90%,現在cpu使用率為30%!從cpu角度看提高了很多;同時從perf top結果看,close系統調用所占的cpu也降低了不少。

由于之前采用多線程架構存在如下問題:

1、批量的close系統調用導緻問題;此處是業務沒有處理好,同時close确實存在vfs的lock沖突

2、 多線程批量accept open fd 也會觸發vfs的全局鎖

close系統調用分析-性能優化

/*
 * cloning flags:
 */
#define CSIGNAL        0x000000ff    /* signal mask to be sent at exit */
#define CLONE_VM    0x00000100    /* set if VM shared between processes */
#define CLONE_FS    0x00000200    /* set if fs info shared between processes. Each process has its own root directory and current working directory; the kernel records these in struct fs_struct, and the task's ->fs field points to it. */
#define CLONE_FILES    0x00000400    /* set if open files shared between processes. The files a process has opened are recorded in struct files_struct — sharing it is why threads contend on files->file_lock in close()/open(). */
#define CLONE_SIGHAND    0x00000800    /* set if signal handlers and blocked signals shared */
#define CLONE_PTRACE    0x00002000    /* set if we want to let tracing continue on the child too */
#define CLONE_VFORK    0x00004000    /* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT    0x00008000    /* set if we want to have the same parent as the cloner */
#define CLONE_THREAD    0x00010000    /* Same thread group? */
#define CLONE_NEWNS    0x00020000    /* New namespace group? */
#define CLONE_SYSVSEM    0x00040000    /* share system V SEM_UNDO semantics */
#define CLONE_SETTLS    0x00080000    /* create a new TLS for the child */
#define CLONE_PARENT_SETTID    0x00100000    /* set the TID in the parent */
#define CLONE_CHILD_CLEARTID    0x00200000    /* clear the TID in the child */
#define CLONE_DETACHED        0x00400000    /* Unused, ignored */
#define CLONE_UNTRACED        0x00800000    /* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID    0x01000000    /* set the TID in the child */
/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
   and is now available for re-use. */
#define CLONE_NEWUTS        0x04000000    /* New utsname group? */
#define CLONE_NEWIPC        0x08000000    /* New ipcs */
#define CLONE_NEWUSER        0x10000000    /* New user namespace */
#define CLONE_NEWPID        0x20000000    /* New pid namespace */
#define CLONE_NEWNET        0x40000000    /* New network namespace */
#define CLONE_IO        0x80000000    /* Clone io context */
/*
 * fork(): the child gets its own (copy-on-write) address space; the only
 * flag passed is SIGCHLD, the signal delivered to the parent on exit.
 * The child starts with the parent's stack pointer (regs->sp); COW gives
 * it private pages on first write.
 */
int sys_fork(struct pt_regs *regs)
{
    return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
/*
 * vfork(): CLONE_VM makes the child share the parent's mm (no page-table
 * copy at all), and CLONE_VFORK blocks the parent until the child calls
 * exec() or exits — so the shared address space is never used concurrently.
 */
int sys_vfork(struct pt_regs *regs)
{
    return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
               NULL, NULL);
}

/*
 * clone(): the caller selects which resources to share via clone_flags
 * (CLONE_VM, CLONE_FILES, CLONE_FS, ...). A NULL new stack means "keep
 * using the caller's current stack pointer" — pthread_create() passes an
 * explicit stack, plain clone() callers may not.
 */
long
sys_clone(unsigned long clone_flags, unsigned long newsp,
      void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
    unsigned long stack_start = newsp ? newsp : regs->sp;

    return do_fork(clone_flags, stack_start, regs, 0, parent_tid, child_tid);
}

一般來說 fork 後父子程序隔離開

close系統調用分析-性能優化

vfork:父子程序共享mm;vfork系統調用不同于fork,用vfork建立的子程序共享位址空間,也就是說子程序完全運作在父程序的位址空間上,子程序對虛拟位址空間任何資料的修改同樣為父程序所見。但是用vfork建立子程序後,父程序會被阻塞,直到子程序調用exec或exit。這樣的好處是:子程序被建立後往往僅僅是為了調用exec執行另一個程式,它不會對父程序的位址空間有任何引用,是以對位址空間的複制是多餘的,通過vfork可以減少不必要的開銷。

close系統調用分析-性能優化

  pthread_create:父子程序共享主要 mm fs file signal 等資源

close系統調用分析-性能優化

是以在open以及close fd時,全局的files檔案鎖(files->file_lock)需要加鎖;

同時參考:這篇文章的分析;對比下面兩張圖: 發現一個close為30us一個為ns級别(顯示0us)

close系統調用分析-性能優化
close系統調用分析-性能優化

close系統調用分析

/*
 * close(fd): detach the descriptor from the per-process fd table under
 * files->file_lock, then release the struct file itself OUTSIDE the lock
 * via filp_close().  When many threads share one files_struct
 * (CLONE_FILES, i.e. pthreads), this spinlock is the contention point
 * seen in the perf profile discussed above.
 */
SYSCALL_DEFINE1(close, unsigned int, fd)
{
    struct file * filp;
    struct files_struct *files = current->files;  /* shared across CLONE_FILES threads */
    struct fdtable *fdt;
    int retval;

    spin_lock(&files->file_lock);
    fdt = files_fdtable(files);
    if (fd >= fdt->max_fds)
        goto out_unlock;
    filp = fdt->fd[fd];
    if (!filp)
        goto out_unlock;
    rcu_assign_pointer(fdt->fd[fd], NULL);  /* clear the slot; RCU readers now see NULL */
    FD_CLR(fd, fdt->close_on_exec);
    __put_unused_fd(files, fd);  /* fd number becomes reusable immediately */
    spin_unlock(&files->file_lock);
    
    retval = filp_close(filp, files);  /* drop the file reference, lock released */

    /* can't restart close syscall because file table entry was cleared */
    if (unlikely(retval == -ERESTARTSYS ||
             retval == -ERESTARTNOINTR ||
             retval == -ERESTARTNOHAND ||
             retval == -ERESTART_RESTARTBLOCK))
        retval = -EINTR;

    return retval;

out_unlock:
    spin_unlock(&files->file_lock);
    return -EBADF;
}
/*
 * "id" is the POSIX thread ID. We use the
 * files pointer for this..
 */
int filp_close(struct file *filp, fl_owner_t id)
{
    int retval = 0;

    /* A zero refcount here means the file was already fully closed —
     * warn instead of driving the count negative. */
    if (!file_count(filp)) {
        printk(KERN_ERR "VFS: Close: file count is 0\n");
        return 0;
    }

    /* Let the filesystem flush per-open state; its result becomes
     * close()'s return value. */
    if (filp->f_op && filp->f_op->flush)
        retval = filp->f_op->flush(filp, id);

    if (likely(!(filp->f_mode & FMODE_PATH))) {
        dnotify_flush(filp, id);
        locks_remove_posix(filp, id);  /* drop this owner's POSIX record locks */
    }
    fput(filp);  /* drop the reference; the last one triggers __fput() */
    return retval;
}
void fput(struct file *file)
{
    /* Decrement the reference count first and tear the file down only
     * when it reaches zero: when a socket (or any file) is referenced
     * from several places, only the last close() reaches __fput() and
     * the release/destroy path below. */
    if (atomic_long_dec_and_test(&file->f_count))
        __fput(file);
}
/* the real guts of fput() - releasing the last reference to file
 */
static void __fput(struct file *file)
{
    struct dentry *dentry = file->f_path.dentry;
    struct vfsmount *mnt = file->f_path.mnt;
    struct inode *inode = dentry->d_inode;

    might_sleep();

    fsnotify_close(file);
    /*
     * The function eventpoll_release() should be the first called
     * in the file cleanup chain.
     */
    eventpoll_release(file);
    locks_remove_flock(file);

    if (unlikely(file->f_flags & FASYNC)) {
        if (file->f_op && file->f_op->fasync)
            file->f_op->fasync(-1, file, 0);
    }
    if (file->f_op && file->f_op->release)
        file->f_op->release(inode, file);  /* close() reaches the file's release op here; for sockets this is sock_close() */
    security_file_free(file);
    ima_file_free(file);
    if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
             !(file->f_mode & FMODE_PATH))) {
        cdev_put(inode->i_cdev);
    }
    fops_put(file->f_op);
    put_pid(file->f_owner.pid);
    file_sb_list_del(file);
    if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
        i_readcount_dec(inode);
    if (file->f_mode & FMODE_WRITE)
        drop_file_write_access(file);
    file->f_path.dentry = NULL;
    file->f_path.mnt = NULL;
    file_free(file);
    /* dentry/mnt references are dropped last, after the file is gone */
    dput(dentry);
    mntput(mnt);
}

在close系統調用中會調用檔案的release操作;socket實作的檔案操作結構如下所示,其中本文讨論的release函數實作為sock_close;

/*
 *    Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
 *    in the operation structures but are done directly via the socketcall() multiplexor.
 */

/* File operations installed on every socket's struct file. close()
 * reaches .release (sock_close) through __fput(); that is the path
 * traced in this article. */
static const struct file_operations socket_file_ops = {
    .owner =    THIS_MODULE,
    .llseek =    no_llseek,
    .aio_read =    sock_aio_read,
    .aio_write =    sock_aio_write,
    .poll =        sock_poll,
    .unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl = compat_sock_ioctl,
#endif
    .mmap =        sock_mmap,
    .open =        sock_no_open,    /* special open code to disallow open via /proc */
    .release =    sock_close,
    .fasync =    sock_fasync,
    .sendpage =    sock_sendpage,
    .splice_write = generic_splice_sendpage,
    .splice_read =    sock_splice_read,
};
/*
 * .release handler for socket files: hand the socket back to the
 * protocol stack.  A NULL inode means we were closing a socket whose
 * setup never finished, so there is nothing to release.
 */
static int sock_close(struct inode *inode, struct file *filp)
{
    if (inode)
        sock_release(SOCKET_I(inode));
    else
        printk(KERN_DEBUG "sock_close: NULL inode\n");

    return 0;
}

/**
 *    sock_release    -    close a socket
 *    @sock: socket to close
 *
 *    The socket is released from the protocol stack if it has a release
 *    callback, and the inode is then released if the socket is bound to
 *    an inode not a file.
 */

void sock_release(struct socket *sock)
{
    if (sock->ops) {
        struct module *owner = sock->ops->owner;

        sock->ops->release(sock);  /* protocol-level release: for an AF_INET
                                      socket this is inet_release(), which for
                                      a TCP sock ends up in tcp_close() */
        sock->ops = NULL;
        module_put(owner);
    }

    if (rcu_dereference_protected(sock->wq, 1)->fasync_list)
        printk(KERN_ERR "sock_release: fasync list not empty!\n");

    percpu_sub(sockets_in_use, 1);  /* decrement this CPU's open-socket count */
    if (!sock->file) {
        iput(SOCK_INODE(sock));  /* socket bound to an inode, not a file: drop it here */
        return;
    }
    sock->file = NULL;  /* socket closed; the rest of close() continues in __fput() */
}

http代理伺服器(3-4-7層代理)-網絡事件庫公共元件、核心kernel驅動 攝像頭驅動 tcpip網絡協定棧、netfilter、bridge 好像看過!!!!

但行好事 莫問前程

--身高體重180的胖子