Chipset: MSM8X25Q
Codebase: Android4.1
Kernel: 3.4.0
概念:
OOM killer,即out of memory killer,是Linux下面的一種當記憶體耗盡時的處理機制。當記憶體不足時,OOM killer會周遊整個程序連結清單,然後根據程序的記憶體使用情況以及它的oom score值找到得分最高的程序,然後發送kill信号将其殺掉。
夥伴系統中在配置設定記憶體時會做判斷,當記憶體不足時,會調用核心函數out_of_memory(),函數位于檔案mm/oom_kill.c。
下面先分析out_of_memory()。
out_of_memory():
/*
 * out_of_memory - OOM killer entry point (excerpt).
 *
 * As shown here, the function either kills the allocating task
 * (when sysctl_oom_kill_allocating_task is set) or asks
 * select_bad_process() for the highest-scoring task and kills it via
 * oom_kill_process(). If no task can be selected it panics.
 *
 * NOTE(review): this is an abridged quote. The "~~snip" markers stand for
 * elided code; totalpages, mpol_mask and constraint are used below without
 * a visible assignment, so they are presumably set in the elided parts.
 */
void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
int order, nodemask_t *nodemask, bool force_kill)
{
const nodemask_t *mpol_mask;
struct task_struct *p;
unsigned long totalpages;
unsigned long freed = 0;
unsigned int points;
enum oom_constraint constraint = CONSTRAINT_NONE;
/* Set to 1 only when the select_bad_process() victim was actually killed. */
int killed = 0;
~~snip
/*
 * If the current task already has a fatal signal pending, flag it with
 * TIF_MEMDIE and return immediately. After all, the OOM path ultimately
 * delivers a kill signal in order to free memory.
 */
if (fatal_signal_pending(current)) {
set_thread_flag(TIF_MEMDIE);
return;
}
~~snip
/*
 * User space can change the OOM behaviour through
 * /proc/sys/vm/panic_on_oom: 1 means panic immediately on OOM, 0 means
 * only kill the "best" process and let the system keep running.
 */
check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
read_lock(&tasklist_lock);
/*
 * Likewise, when /proc/sys/vm/oom_kill_allocating_task is true, the task
 * that is currently allocating is killed directly (provided it is
 * killable and has an mm).
 */
if (sysctl_oom_kill_allocating_task &&
!oom_unkillable_task(current, NULL, nodemask) &&
current->mm) {
oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
nodemask,
"Out of memory (oom_kill_allocating_task)");
goto out;
}
/*
 * Pick the task with the highest points value, based on each task's
 * memory usage and oom score information.
 */
p = select_bad_process(&points, totalpages, NULL, mpol_mask,
force_kill);
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
read_unlock(&tasklist_lock);
panic("Out of memory and no killable processes...\n");
}
/*
 * select_bad_process() returns ERR_PTR(-1UL) when it decided that no new
 * kill should be issued; only a real task pointer is killed here.
 */
if (PTR_ERR(p) != -1UL) {
/* A victim was selected: kill it. */
oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
nodemask, "Out of memory");
killed = 1;
}
out:
read_unlock(&tasklist_lock);
/*
 * Give "p" a good chance of killing itself before we
 * retry to allocate memory unless "p" is current
 */
if (killed && !test_thread_flag(TIF_MEMDIE))
schedule_timeout_uninterruptible(1);
}
select_bad_process():
/*
 * select_bad_process - choose the task the OOM killer should kill.
 *
 * Walks every thread, scores each eligible one with oom_badness() and keeps
 * the one with the highest score.
 *
 * @ppoints:    out parameter; reset to 0 on entry and updated to the score
 *              of the chosen task.
 * @totalpages: passed through to oom_badness().
 * @memcg:      passed through to oom_unkillable_task() and oom_badness().
 * @nodemask:   passed through to oom_unkillable_task() and oom_badness().
 * @force_kill: when true, the early "wait for an already dying task"
 *              returns below are skipped.
 *
 * Returns the chosen task, NULL if none was chosen, or ERR_PTR(-1UL) when a
 * task is already being OOM-killed or is exiting and force_kill is false.
 */
static struct task_struct *select_bad_process(unsigned int *ppoints,
unsigned long totalpages, struct mem_cgroup *memcg,
const nodemask_t *nodemask, bool force_kill)
{
struct task_struct *g, *p;
struct task_struct *chosen = NULL;
*ppoints = 0;
/* Iterate over all threads of all processes. */
do_each_thread(g, p) {
unsigned int points;
/* Tasks that are already in an exit state are ignored. */
if (p->exit_state)
continue;
/*
 * Skip tasks that oom_unkillable_task() reports as not killable
 * (per the original note: e.g. init and kernel threads).
 */
if (oom_unkillable_task(p, memcg, nodemask))
continue;
/*
 * A task that is already being OOM-killed (TIF_MEMDIE set) is not
 * selected again: thaw it if it is frozen, and unless force_kill is
 * set, tell the caller to not kill anything else for now.
 */
if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
if (unlikely(frozen(p)))
__thaw_task(p);
if (!force_kill)
return ERR_PTR(-1UL);
}
/* Tasks without an mm are skipped. */
if (!p->mm)
continue;
if (p->flags & PF_EXITING) {
/*
 * The current task is itself exiting: pre-select it with the
 * maximum score of 1000. The loop keeps running afterwards.
 */
if (p == current) {
chosen = p;
*ppoints = 1000;
} else if (!force_kill) {
/*
 * If this task is not being ptraced on exit,
 * then wait for it to finish before killing
 * some other task unnecessarily.
 */
if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
return ERR_PTR(-1UL);
}
}
/* Compute the points (badness score) for this task. */
points = oom_badness(p, memcg, nodemask, totalpages);
/*
 * If this task scores strictly higher than the best seen so far,
 * remember it and its score.
 */
if (points > *ppoints) {
chosen = p;
*ppoints = points;
}
} while_each_thread(g, p);
return chosen;
}
oom_badness():
/*
 * oom_badness - compute the OOM score ("points") of a task.
 *
 * @p:          task to score.
 * @memcg:      passed through to oom_unkillable_task().
 * @nodemask:   passed through to oom_unkillable_task().
 * @totalpages: denominator used to normalize the memory usage; a value of
 *              0 is treated as 1.
 *
 * Returns 0 when the task must not be killed (unkillable, no lockable mm,
 * or oom_score_adj == OOM_SCORE_ADJ_MIN); otherwise a value in 1..1000,
 * where a higher value means a more likely victim.
 */
unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
const nodemask_t *nodemask, unsigned long totalpages)
{
long points;
if (oom_unkillable_task(p, memcg, nodemask))
return 0;
/* From here on p is the task returned (locked) by find_lock_task_mm(). */
p = find_lock_task_mm(p);
if (!p)
return 0;
/*
 * Tasks whose oom_score_adj equals OOM_SCORE_ADJ_MIN (-1000 according to
 * the original note) are not considered. The value can be set through
 * /proc/<pid>/oom_score_adj and ranges from -1000 to 1000; the larger it
 * is, the more likely the task is to be OOM-killed.
 */
if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
task_unlock(p);
return 0;
}
/*
 * The memory controller may have a limit of 0 bytes, so avoid a divide
 * by zero, if necessary.
 */
if (!totalpages)
totalpages = 1;
/*
 * get_mm_rss() gives the memory currently used by the task's file-backed
 * and anonymous user-space pages; nr_ptes is the memory currently used
 * to hold its page tables.
 */
points = get_mm_rss(p->mm) + p->mm->nr_ptes;
/* Add the amount of swap the task is using. */
points += get_mm_counter(p->mm, MM_SWAPENTS);
/*
 * Normalize to parts-per-thousand of totalpages. This is applied
 * identically to every task, so it does not affect their ordering.
 */
points *= 1000;
points /= totalpages;
task_unlock(p);
/*
 * A task holding CAP_SYS_ADMIN gets its points lowered by 30, because
 * tasks with admin privileges are assumed to be well behaved.
 */
if (has_capability_noaudit(p, CAP_SYS_ADMIN))
points -= 30;
/* Add oom_score_adj, which ranges from -1000 to 1000. */
points += p->signal->oom_score_adj;
/*
 * Never return 0 for an eligible task that may be killed since it's
 * possible that no single user task uses more than 0.1% of memory and
 * no single admin tasks uses more than 3.0%.
 */
if (points <= 0)
return 1;
/* Cap the result at 1000. */
return (points < 1000) ? points : 1000;
}
oom_kill_process():
/*
 * oom_kill_process - kill the selected task, or one of its children instead.
 *
 * @p:          task selected by the caller.
 * @gfp_mask:   passed to dump_header() for the report.
 * @order:      passed to dump_header() for the report.
 * @points:     score of @p, printed in the log message.
 * @totalpages: passed to oom_badness() when scoring children.
 * @memcg:      passed to dump_header() and oom_badness().
 * @nodemask:   passed to dump_header() and oom_badness().
 * @message:    prefix for the "Kill process" log line.
 *
 * If @p is already exiting it is only flagged with TIF_MEMDIE. Otherwise
 * the highest-scoring child with a different mm (if any) replaces @p as the
 * victim, every other process sharing the victim's mm is sent SIGKILL, and
 * finally the victim itself is flagged with TIF_MEMDIE and sent SIGKILL.
 */
static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
unsigned int points, unsigned long totalpages,
struct mem_cgroup *memcg, nodemask_t *nodemask,
const char *message)
{
struct task_struct *victim = p;
struct task_struct *child;
struct task_struct *t = p;
struct mm_struct *mm;
unsigned int victim_points = 0;
/* Rate-limits the dump_header() report below. */
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
/*
 * If the task is already exiting, don't alarm the sysadmin or kill
 * its children or threads, just set TIF_MEMDIE so it can die quickly
 */
if (p->flags & PF_EXITING) {
set_tsk_thread_flag(p, TIF_MEMDIE);
return;
}
if (__ratelimit(&oom_rs))
dump_header(p, gfp_mask, order, memcg, nodemask);
task_lock(p);
pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
message, task_pid_nr(p), p->comm, points);
task_unlock(p);
/*
 * Among the children (of every thread of p) whose mm differs from the
 * parent's, find the one with the highest points and kill it in place of
 * the parent. So when a process has several children that use a lot of
 * memory, a child may be killed while the parent stays alive.
 */
do {
list_for_each_entry(child, &t->children, sibling) {
unsigned int child_points;
/* Children sharing the parent's mm are not candidates. */
if (child->mm == p->mm)
continue;
/*
 * oom_badness() returns 0 if the thread is unkillable
 */
child_points = oom_badness(child, memcg, nodemask,
totalpages);
if (child_points > victim_points) {
victim = child;
victim_points = child_points;
}
}
} while_each_thread(p, t);
victim = find_lock_task_mm(victim);
if (!victim)
return;
/* mm cannot safely be dereferenced after task_unlock(victim) */
mm = victim->mm;
pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
K(get_mm_counter(victim->mm, MM_ANONPAGES)),
K(get_mm_counter(victim->mm, MM_FILEPAGES)));
task_unlock(victim);
/*
 * Every process that shares the same mm (i.e. shares memory with the
 * chosen highest-points victim) is killed together with it, except
 * members of the victim's own thread group, kernel threads, and tasks
 * whose oom_score_adj is OOM_SCORE_ADJ_MIN.
 */
for_each_process(p)
if (p->mm == mm && !same_thread_group(p, victim) &&
!(p->flags & PF_KTHREAD)) {
if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
continue;
task_lock(p); /* Protect ->comm from prctl() */
pr_err("Kill process %d (%s) sharing same memory\n",
task_pid_nr(p), p->comm);
task_unlock(p);
/* Send the SIGKILL signal. */
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
}
/* Finally flag the victim with TIF_MEMDIE and send it SIGKILL too. */
set_tsk_thread_flag(victim, TIF_MEMDIE);
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
}
是以,out_of_memory()做的任務就是周遊系統全部程序,然後根據記憶體使用情況以及oom_score_adj的值計算得到一個point, 最終将最高point的task給kill掉。
相關知識:
1. Malloc會引起OOM killer,可參考:
http://blog.dccmx.com/2011/04/oom-killer-on-linux
2. OOM killer隻管理、計算low memory部分,是以即使high memory還有很多空閑記憶體,也可能觸發OOM killer。
3. 程序rss的計算可參考此文:
http://filwmm1314.blog.163.com/blog/static/2182591920121016541582/
4. 影響到oom killer行為的檔案有:
/proc/sys/vm/overcommit_memory
/proc/sys/vm/panic_on_oom
/proc/sys/vm/oom_kill_allocating_task
/proc/pid_xxx/oom_score_adj
2013/04/27