Got pulled in today to work overtime on a performance problem:

The before/after comparison of the optimization: at the same load of 50k CPS, CPU usage used to be 90% and is now 30%. That is a big win from the CPU point of view, and in the perf top output the CPU share of the close system call also dropped noticeably.
The previous multi-threaded architecture had the following problems:
1. Batched close system calls caused trouble; partly the application logic did not handle this well, and close really does contend on the VFS lock.
2. Multiple threads doing batched accept and opening fds also hit the same process-wide lock in the VFS.
/*
* cloning flags:
*/
#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */
#define CLONE_VM 0x00000100 /* set if VM shared between processes */
#define CLONE_FS 0x00000200 /* set if fs info shared between processes: each process has its own root directory and current working directory; the kernel records this in struct fs_struct, and the fs field of the process descriptor points to that process's fs_struct */
#define CLONE_FILES 0x00000400 /* set if open files shared between processes: a process also has to track the files it has opened; all of a process's open files are recorded in struct files_struct */
#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */
#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */
#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
#define CLONE_THREAD 0x00010000 /* Same thread group? */
#define CLONE_NEWNS 0x00020000 /* New namespace group? */
#define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */
#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */
#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */
#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */
#define CLONE_DETACHED 0x00400000 /* Unused, ignored */
#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
and is now available for re-use. */
#define CLONE_NEWUTS 0x04000000 /* New utsname group? */
#define CLONE_NEWIPC 0x08000000 /* New ipcs */
#define CLONE_NEWUSER 0x10000000 /* New user namespace */
#define CLONE_NEWPID 0x20000000 /* New pid namespace */
#define CLONE_NEWNET 0x40000000 /* New network namespace */
#define CLONE_IO 0x80000000 /* Clone io context */
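The CLONE_FS / CLONE_FILES comments above are the crux here: pthread_create creates threads with CLONE_FILES (among other flags), so every thread in a process shares one files_struct and therefore one files->file_lock. As a minimal sketch of what CLONE_FILES means (my own illustration, not from the kernel source above; /dev/null and the shared_fd variable are arbitrary), a task created with raw clone() and CLONE_FILES installs its fds into the fd table it shares with its parent:

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/wait.h>

static int shared_fd = -1;	/* visible to both tasks because of CLONE_VM */

static int child_fn(void *arg)
{
	/* With CLONE_FILES this open() installs the fd in the files_struct
	 * shared with the parent. */
	shared_fd = open("/dev/null", O_RDONLY);
	return 0;
}

int main(void)
{
	char *stack = malloc(1024 * 1024);
	if (!stack)
		return 1;

	/* CLONE_FILES: share the fd table; CLONE_VM: share memory so the
	 * parent can read shared_fd. This is a subset of the flags that
	 * pthread_create() uses. */
	pid_t pid = clone(child_fn, stack + 1024 * 1024,
			  CLONE_FILES | CLONE_VM | SIGCHLD, NULL);
	if (pid < 0)
		return 1;
	waitpid(pid, NULL, 0);

	/* The fd opened in the child is valid here too: one files_struct,
	 * one files->file_lock, shared by every task in the group. */
	if (shared_fd >= 0 && fcntl(shared_fd, F_GETFD) != -1)
		printf("parent sees fd %d opened by the child\n", shared_fd);
	return 0;
}

With a plain fork() instead, the child would get a private copy of the fd table and the parent would not see the new descriptor.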
int sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
}
/*
* This is trivial, and on the face of it looks like it
* could equally well be done in user mode.
*
* Not so, for quite unobvious reasons - register pressure.
* In user mode vfork() cannot have a stack frame, and if
* done by calling the "clone()" system call directly, you
* do not have enough call-clobbered registers to hold all
* the information you need.
*/
int sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
		       NULL, NULL);
}

long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
Generally speaking, fork() isolates the parent and child from each other.
vfork: parent and child share the mm. Unlike fork, a child created with vfork shares the address space, meaning the child runs entirely inside the parent's address space, and any change the child makes to the virtual address space is also visible to the parent. After vfork, though, the parent is blocked until the child calls exec or exit. The benefit: when the child is created only to exec another program, it never needs to reference the parent's address space, so copying that address space would be pure waste, and vfork avoids the unnecessary overhead.
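A quick user-space illustration of that behaviour (my own sketch, not from the post; note that POSIX formally only allows the vfork child to modify the variable holding vfork's return value, so writing another variable here is a Linux-specific demonstration of the shared mm):

#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

int main(void)
{
	volatile int shared = 0;	/* volatile: force the parent to reread it from memory */

	pid_t pid = vfork();
	if (pid == 0) {
		/* Child: runs first, directly in the parent's address space,
		 * so this store is visible to the parent. */
		shared = 42;
		_exit(0);		/* the child must _exit() or exec(), never return */
	}

	/* Parent: only resumes here after the child has called _exit(). */
	waitpid(pid, NULL, 0);
	printf("shared = %d (written by the vfork child)\n", shared);
	return 0;
}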
pthread_create: parent and child share the major resources: mm, fs, files, signal, and so on.
That is why opening or closing an fd from any thread has to take the lock on the shared files_struct (files->file_lock), which is in effect global to the whole process; the micro-benchmark sketch below shows the resulting contention.
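A rough way to see that contention (a sketch under my own assumptions, not the original workload; the worker count, iteration count and /dev/null are arbitrary): run the same open()/close() loop once in N threads of a single process, where they all serialize on one files->file_lock, and once in N forked processes, each with its own files_struct:

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/wait.h>
#include <time.h>

#define NWORKERS 8
#define ITERS    200000

/* Hammer the fd table: shared when run as threads, private when forked. */
static void *worker(void *arg)
{
	for (int i = 0; i < ITERS; i++)
		close(open("/dev/null", O_RDONLY));
	return NULL;
}

static double elapsed(struct timespec a, struct timespec b)
{
	return (b.tv_sec - a.tv_sec) + (b.tv_nsec - a.tv_nsec) / 1e9;
}

int main(void)
{
	struct timespec t0, t1;
	pthread_t tid[NWORKERS];

	/* 1) N threads: one files_struct, one files->file_lock. */
	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (int i = 0; i < NWORKERS; i++)
		pthread_create(&tid[i], NULL, worker, NULL);
	for (int i = 0; i < NWORKERS; i++)
		pthread_join(tid[i], NULL);
	clock_gettime(CLOCK_MONOTONIC, &t1);
	printf("threads:   %.2fs\n", elapsed(t0, t1));

	/* 2) N processes: each child has its own files_struct. */
	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (int i = 0; i < NWORKERS; i++) {
		if (fork() == 0) {
			worker(NULL);
			_exit(0);
		}
	}
	for (int i = 0; i < NWORKERS; i++)
		wait(NULL);
	clock_gettime(CLOCK_MONOTONIC, &t1);
	printf("processes: %.2fs\n", elapsed(t0, t1));
	return 0;
}

Build with gcc -O2 -pthread. The absolute numbers depend on the kernel version and core count, but the thread variant is the one where perf top tends to show the open/close path burning CPU on the shared lock.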
Also, see the analysis in that article for reference; comparing the two figures, one close comes in at about 30us while the other is at the ns level (shown as 0us).
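The two figures are not reproduced here; a minimal user-space way to get a comparable number (my own sketch, and the freshly created, unconnected TCP socket is an arbitrary choice) is to time a single close() with clock_gettime:

#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include <sys/socket.h>

int main(void)
{
	struct timespec t0, t1;

	/* Time one close() of a freshly created TCP socket. */
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0)
		return 1;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	close(fd);
	clock_gettime(CLOCK_MONOTONIC, &t1);

	long ns = (t1.tv_sec - t0.tv_sec) * 1000000000L +
		  (t1.tv_nsec - t0.tv_nsec);
	printf("close() took %ld ns (%.1f us)\n", ns, ns / 1000.0);
	return 0;
}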
Analysis of the close system call
SYSCALL_DEFINE1(close, unsigned int, fd)
{
	struct file * filp;
	struct files_struct *files = current->files;
	struct fdtable *fdt;
	int retval;

	spin_lock(&files->file_lock);
	fdt = files_fdtable(files);
	if (fd >= fdt->max_fds)
		goto out_unlock;
	filp = fdt->fd[fd];
	if (!filp)
		goto out_unlock;
	rcu_assign_pointer(fdt->fd[fd], NULL);
	FD_CLR(fd, fdt->close_on_exec);
	__put_unused_fd(files, fd);
	spin_unlock(&files->file_lock);
	retval = filp_close(filp, files);

	/* can't restart close syscall because file table entry was cleared */
	if (unlikely(retval == -ERESTARTSYS ||
		     retval == -ERESTARTNOINTR ||
		     retval == -ERESTARTNOHAND ||
		     retval == -ERESTART_RESTARTBLOCK))
		retval = -EINTR;

	return retval;

out_unlock:
	spin_unlock(&files->file_lock);
	return -EBADF;
}
/*
* "id" is the POSIX thread ID. We use the
* files pointer for this..
*/
int filp_close(struct file *filp, fl_owner_t id)
{
	int retval = 0;

	if (!file_count(filp)) {
		printk(KERN_ERR "VFS: Close: file count is 0\n");
		return 0;
	}

	if (filp->f_op && filp->f_op->flush)
		retval = filp->f_op->flush(filp, id);

	if (likely(!(filp->f_mode & FMODE_PATH))) {
		dnotify_flush(filp, id);
		locks_remove_posix(filp, id);
	}
	fput(filp);
	return retval;
}
void fput(struct file *file)
{
	/* Note: fput() first decrements the file's reference count and only
	 * continues when the count has dropped to 0. So when a socket has
	 * several references, only the last close() triggers the teardown
	 * path below. */
	if (atomic_long_dec_and_test(&file->f_count))
		__fput(file);
}
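That reference counting is easy to observe from user space: dup() makes a second fd point at the same struct file and bumps f_count, so closing the original fd does not run __fput(); the object is only torn down when the last descriptor goes away. A small sketch (my own illustration; /dev/null stands in for any file or socket):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/null", O_WRONLY);
	int dup_fd = dup(fd);	/* same struct file, f_count goes 1 -> 2 */

	close(fd);		/* f_count 2 -> 1: __fput() is NOT called yet */

	/* The underlying file object is still alive via dup_fd. */
	ssize_t n = write(dup_fd, "x", 1);
	printf("write after closing the original fd: %zd\n", n);

	close(dup_fd);		/* f_count 1 -> 0: __fput() releases the file */
	return 0;
}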
/* the real guts of fput() - releasing the last reference to file
*/
static void __fput(struct file *file)
{
	struct dentry *dentry = file->f_path.dentry;
	struct vfsmount *mnt = file->f_path.mnt;
	struct inode *inode = dentry->d_inode;

	might_sleep();

	fsnotify_close(file);
	/*
	 * The function eventpoll_release() should be the first called
	 * in the file cleanup chain.
	 */
	eventpoll_release(file);
	locks_remove_flock(file);

	if (unlikely(file->f_flags & FASYNC)) {
		if (file->f_op && file->f_op->fasync)
			file->f_op->fasync(-1, file, 0);
	}
	if (file->f_op && file->f_op->release)
		file->f_op->release(inode, file);	/* the close() path invokes the file's ->release operation here */
	security_file_free(file);
	ima_file_free(file);
	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
		     !(file->f_mode & FMODE_PATH))) {
		cdev_put(inode->i_cdev);
	}
	fops_put(file->f_op);
	put_pid(file->f_owner.pid);
	file_sb_list_del(file);
	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
		i_readcount_dec(inode);
	if (file->f_mode & FMODE_WRITE)
		drop_file_write_access(file);
	file->f_path.dentry = NULL;
	file->f_path.mnt = NULL;
	file_free(file);
	dput(dentry);
	mntput(mnt);
}
The close system call ends up invoking the file's release operation. The file_operations structure that sockets implement is shown below; the release implementation discussed here is sock_close.
/*
* Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
* in the operation structures but are done directly via the socketcall() multiplexor.
*/
static const struct file_operations socket_file_ops = {
	.owner = THIS_MODULE,
	.llseek = no_llseek,
	.aio_read = sock_aio_read,
	.aio_write = sock_aio_write,
	.poll = sock_poll,
	.unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = compat_sock_ioctl,
#endif
	.mmap = sock_mmap,
	.open = sock_no_open,	/* special open code to disallow open via /proc */
	.release = sock_close,
	.fasync = sock_fasync,
	.sendpage = sock_sendpage,
	.splice_write = generic_splice_sendpage,
	.splice_read = sock_splice_read,
};
static int sock_close(struct inode *inode, struct file *filp)
{
	/*
	 * It was possible the inode is NULL we were
	 * closing an unfinished socket.
	 */
	if (!inode) {
		printk(KERN_DEBUG "sock_close: NULL inode\n");
		return 0;
	}
	sock_release(SOCKET_I(inode));
	return 0;
}
/**
* sock_release - close a socket
* @sock: socket to close
*
* The socket is released from the protocol stack if it has a release
* callback, and the inode is then released if the socket is bound to
* an inode not a file.
*/
void sock_release(struct socket *sock)
{
	if (sock->ops) {
		struct module *owner = sock->ops->owner;

		sock->ops->release(sock);	/* call the release op of the socket's proto family;
						 * in practice this is mainly inet_release, and for a
						 * TCP socket it ends up calling tcp_close */
		sock->ops = NULL;
		module_put(owner);
	}

	if (rcu_dereference_protected(sock->wq, 1)->fasync_list)
		printk(KERN_ERR "sock_release: fasync list not empty!\n");

	percpu_sub(sockets_in_use, 1);	/* decrement this CPU's count of sockets in use */

	if (!sock->file) {
		iput(SOCK_INODE(sock));
		return;
	}
	sock->file = NULL;	/* the socket is closed; the rest of the close() syscall path continues */
}