天天看点

kernel 系统调用----system call

Init

在trap_init中对SYSCALL_VECTOR(编号0x80)的向量进行初始化。

set_system_trap_gate(SYSCALL_VECTOR, &system_call);
           

将system call初始化为trap门,加入到IDT table中,发生中断以后,会跳转到对应system_call的地址去执行后续的中断流程。发生中断到跳转执行中断向量的过程在kernel 中断分析三——中断处理流程有详细解释,本篇只关注system_call的运行过程。

ENTRY(system_call)

/*
   * syscall stub including irq exit should be protected against kprobes
   */
      .pushsection .kprobes.text, "ax"
      # system call handler stub
  ENTRY(system_call)
      RING0_INT_FRAME         # can't unwind into user space anyway
      ASM_CLAC
      pushl_cfi %eax          # save orig_eax  --------------1
      SAVE_ALL                                    -----------2
      GET_THREAD_INFO(%ebp)                       -----------3
                      # system call tracing in operation / emulation
      testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) ---------4
      jnz syscall_trace_entry
      cmpl $(NR_syscalls), %eax
      jae syscall_badsys
  syscall_call:                                      --------5
      call *sys_call_table(,%eax,4)
  syscall_after_call:                                --------6
      movl %eax,PT_EAX(%esp)      # store the return value
  syscall_exit:                                      --------7
      LOCKDEP_SYS_EXIT
      DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
                      # setting need_resched or sigpending
                      # between sampling and the iret
      TRACE_IRQS_OFF
      movl TI_flags(%ebp), %ecx
      testl $_TIF_ALLWORK_MASK, %ecx  # current->work
      jne syscall_exit_work
 
           
  1. RING0_INT_FRAME只是用于栈回溯的CFI标注宏,本身不产生指令;执行int 0x80时CPU已经切换到内核栈并压入了用户态的现场。随后将eax中的系统调用号入栈(保存为orig_eax)
  2. 保存现场,即用户态的一些寄存器值
  3. 将thread_info的地址保存到ebp寄存器
  4. 当前进程是否有被trace,如果有就执行相关的动作保存当时的追踪信息
  5. 调用对应的系统调用函数
  6. 将系统调用的返回值(eax)保存到内核栈上寄存器现场的eax位置(PT_EAX(%esp)),返回用户态时会随现场一起恢复到eax
  7. 屏蔽其他中断。检测当前进程是否还有工作没有完成,如果有,那么跳转到syscall_exit_work
  8. 然后恢复userspace被压入栈的寄存器,返回userspace
restore_all:
      TRACE_IRQS_IRET
  restore_all_notrace:
  #ifdef CONFIG_X86_ESPFIX32
      movl PT_EFLAGS(%esp), %eax  # mix EFLAGS, SS and CS     ------------1
      # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
      # are returning to the kernel.
      # See comments in process.c:copy_thread() for details.
      movb PT_OLDSS(%esp), %ah
      movb PT_CS(%esp), %al
      andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
      cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
      CFI_REMEMBER_STATE
      je ldt_ss           # returning to user-space with LDT SS
  #endif
  restore_nocheck:
      RESTORE_REGS 4         # skip orig_eax/error_code
  irq_return:
      INTERRUPT_RETURN                            ----------
  .section .fixup,"ax"
  ENTRY(iret_exc)
      pushl $0           # no error code
      pushl $do_iret_error
      jmp error_code
  .previous
      _ASM_EXTABLE(irq_return,iret_exc)
 
  #ifdef CONFIG_X86_ESPFIX32
      CFI_RESTORE_STATE
  ldt_ss:
  #ifdef CONFIG_PARAVIRT
      /*
       * The kernel can't run on a non-flat stack if paravirt mode
       * is active.  Rather than try to fixup the high bits of
       * ESP, bypass this code entirely.  This may break DOSemu
       * and/or Wine support in a paravirt VM, although the option
       * is still available to implement the setting of the high
       * -bits in the INTERRUPT_RETURN paravirt-op.
       */
      cmpl $0, pv_info+PARAVIRT_enabled
      jne restore_nocheck
  #endif
 
  /*
   * Setup and switch to ESPFIX stack
   *
   * We're returning to userspace with a 16 bit stack. The CPU will not
   * restore the high word of ESP for us on executing iret... This is an
   * "official" bug of all the x86-compatible CPUs, which we can work
   * around to make dosemu and wine happy. We do this by preloading the
   * high word of ESP with the high word of the userspace ESP while
   * compensating for the offset by changing to the ESPFIX segment with
   * a base address that matches for the difference.
   */
  #define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
      mov %esp, %edx          /* load kernel esp */
      mov PT_OLDESP(%esp), %eax   /* load userspace esp */
      mov %dx, %ax            /* eax: new kernel esp */
      sub %eax, %edx          /* offset (low word is 0) */
      shr $16, %edx
      mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
      mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
      pushl_cfi $__ESPFIX_SS
      pushl_cfi %eax          /* new kernel esp */
      /* Disable interrupts, but do not irqtrace this section: we
       * will soon execute iret and the tracer was already set to
       * the irqstate after the iret */
      DISABLE_INTERRUPTS(CLBR_EAX)
      lss (%esp), %esp        /* switch to espfix segment */
      CFI_ADJUST_CFA_OFFSET -8
      jmp restore_nocheck
  #endif
      CFI_ENDPROC
  ENDPROC(system_call)
           

syscall_exit_work

_TIF_ALLWORK_MASK 的定义如下:

/* Work to do on any return to user space. */
 #define _TIF_ALLWORK_MASK \
   (_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SINGLESTEP|\
    _TIF_ASYNC_TLB|_TIF_NOTIFY_RESUME)
           

当以下情况之一发生时,返回用户态之前需要进入syscall_exit_work处理:

1. 当前进程有信号pending

2. 当前进程需要被重新调度

3. 设置了_TIF_SINGLESTEP,restore singlestep on return to user mode

4. got an async TLB fault in kernel

5. callback before returning to user

syscall_exit_work:
      testl $_TIF_WORK_SYSCALL_EXIT, %ecx----------
      jz work_pending
      TRACE_IRQS_ON
      ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
                      # schedule() instead
      movl %esp, %eax
      call syscall_trace_leave
      jmp resume_userspace------------------------
  END(syscall_exit_work)
           
  1. 检测是否有系统调用退出时需要处理的跟踪工作(_TIF_WORK_SYSCALL_EXIT);如果没有,跳转到work_pending处理其余待办工作
  2. 否则开中断,调用syscall_trace_leave,然后跳转到resume_userspace返回用户态
work_pending:
      testb $_TIF_NEED_RESCHED, %cl     -------------
      jz work_notifysig                 -------------
  work_resched:                         -------------     
      call schedule
      LOCKDEP_SYS_EXIT
      DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
                      # setting need_resched or sigpending
                      # between sampling and the iret
      TRACE_IRQS_OFF
      movl TI_flags(%ebp), %ecx
      andl $_TIF_WORK_MASK, %ecx  # is there any work to be done other
                      # than syscall tracing?
      jz restore_all
      testb $_TIF_NEED_RESCHED, %cl
      jnz work_resched
 
  work_notifysig:             # deal with pending signals and-------------------4
                      # notify-resume requests
  #ifdef CONFIG_VM86
      testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
      movl %esp, %eax
      jne work_notifysig_v86      # returning to kernel-space or
                      # vm86-space
  1:
  #else
      movl %esp, %eax
  #endif
      TRACE_IRQS_ON
      ENABLE_INTERRUPTS(CLBR_NONE)
      movb PT_CS(%esp), %bl
      andb $SEGMENT_RPL_MASK, %bl
      cmpb $USER_RPL, %bl
      jb resume_kernel
      xorl %edx, %edx
      call do_notify_resume -------------------
      jmp resume_userspace
 
  #ifdef CONFIG_VM86
      ALIGN
  work_notifysig_v86:
      pushl_cfi %ecx          # save ti_flags for do_notify_resume
      call save_v86_state     # %eax contains pt_regs pointer
      popl_cfi %ecx
      movl %eax, %esp
      jmp 1b
  #endif
  END(work_pending)
           
  1. 检测_TIF_NEED_RESCHED,若被设置,则继续向下执行work_resched,否则跳转到work_notifysig,进行信号处理
  2. 调用schedule主动让出CPU
  3. 处理pending的信号,具体的处理流程在do_notify_resume 中的do_signal

整个处理流程用流程图表现得更加直观:

kernel 系统调用----system call

继续阅读