From 2b5fe6de58276d0b5a7c884d5dbfc300ca47db78 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 17 Nov 2008 15:40:08 +0100
Subject: thread_group_cputime: move a couple of callsites outside of ->siglock

Impact: relax the locking of cpu-time accounting calls

->siglock buys nothing for thread_group_cputime() in do_sys_times() and
wait_task_zombie() (which btw takes the unrelated parent's ->siglock).

Actually I think do_sys_times() doesn't need ->siglock at all.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index ae2b92b..b9c4d8b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1330,10 +1330,10 @@ static int wait_task_zombie(struct task_struct *p, int options,
 		 * group, which consolidates times for all threads in the
 		 * group including the group leader.
 		 */
+		thread_group_cputime(p, &cputime);
 		spin_lock_irq(&p->parent->sighand->siglock);
 		psig = p->parent->signal;
 		sig = p->signal;
-		thread_group_cputime(p, &cputime);
 		psig->cutime =
 			cputime_add(psig->cutime,
 			cputime_add(cputime.utime,
-- 
cgit v1.1


From 7c0990c7ee988aa193abbb7da3faeb9279146dbf Mon Sep 17 00:00:00 2001
From: Nikanth Karthikesan <knikanth@suse.de>
Date: Wed, 19 Nov 2008 10:20:23 +0100
Subject: Do not free io context when taking recursive faults in do_exit

When taking recursive faults in do_exit, if the io_context is not null,
exit_io_context() is being called. But it might decrement the refcount
more than once. It is better to leave this task alone.

Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 kernel/exit.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index c7422ca..9a21347 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1037,8 +1037,6 @@ NORET_TYPE void do_exit(long code)
 		 * task into the wait for ever nirwana as well.
 		 */
 		tsk->flags |= PF_EXITPIDONE;
-		if (tsk->io_context)
-			exit_io_context();
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		schedule();
 	}
-- 
cgit v1.1


From e5991371ee0d1c0ce19e133c6f9075b49c5b4ae8 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Tue, 6 Jan 2009 14:39:22 -0800
Subject: mm: remove cgroup_mm_owner_callbacks

cgroup_mm_owner_callbacks() was brought in to support the memrlimit
controller, but sneaked into mainline ahead of it.  That controller has
now been shelved, and the mm_owner_changed() args were inadequate for it
anyway (they needed an mm pointer instead of a task pointer).

Remove the dead code, and restore mm_update_next_owner() locking to how it
was before: taking mmap_sem there does nothing for memcontrol.c, now the
only user of mm->owner.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Paul Menage <menage@google.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index c9e5a1c..f923724 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -642,35 +642,31 @@ retry:
 	/*
 	 * We found no owner yet mm_users > 1: this implies that we are
 	 * most likely racing with swapoff (try_to_unuse()) or /proc or
-	 * ptrace or page migration (get_task_mm()).  Mark owner as NULL,
-	 * so that subsystems can understand the callback and take action.
+	 * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
 	 */
-	down_write(&mm->mmap_sem);
-	cgroup_mm_owner_callbacks(mm->owner, NULL);
 	mm->owner = NULL;
-	up_write(&mm->mmap_sem);
 	return;
 
 assign_new_owner:
 	BUG_ON(c == p);
 	get_task_struct(c);
-	read_unlock(&tasklist_lock);
-	down_write(&mm->mmap_sem);
 	/*
 	 * The task_lock protects c->mm from changing.
 	 * We always want mm->owner->mm == mm
 	 */
 	task_lock(c);
+	/*
+	 * Delay read_unlock() till we have the task_lock()
+	 * to ensure that c does not slip away underneath us
+	 */
+	read_unlock(&tasklist_lock);
 	if (c->mm != mm) {
 		task_unlock(c);
-		up_write(&mm->mmap_sem);
 		put_task_struct(c);
 		goto retry;
 	}
-	cgroup_mm_owner_callbacks(mm->owner, c);
 	mm->owner = c;
 	task_unlock(c);
-	up_write(&mm->mmap_sem);
 	put_task_struct(c);
 }
 #endif /* CONFIG_MM_OWNER */
-- 
cgit v1.1


From 901608d9045146aec6f14a7777ea4b1501c379f0 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 6 Jan 2009 14:40:29 -0800
Subject: mm: introduce get_mm_hiwater_xxx(), fix taskstats->hiwater_xxx
 accounting

xacct_add_tsk() relies on do_exit()->update_hiwater_xxx() and uses
mm->hiwater_xxx directly, this leads to 2 problems:

- taskstats_user_cmd() can call fill_pid()->xacct_add_tsk() at any
  moment before the task exits, so we should check the current values of
  rss/vm anyway.

- do_exit()->update_hiwater_xxx() calls are racy.  An exiting thread can
  be preempted right before mm->hiwater_xxx = new_val, and another thread
  can use A_LOT of memory and exit in between.  When the first thread
  resumes it can be the last thread in the thread group, in that case we
  report the wrong hiwater_xxx values which do not take A_LOT into
  account.

Introduce get_mm_hiwater_rss() and get_mm_hiwater_vm() helpers and change
xacct_add_tsk() to use them.  The first helper will also be used by
rusage->ru_maxrss accounting.

Kill do_exit()->update_hiwater_xxx() calls.  Unless we are going to
decrease rss/vm there is no point to update mm->hiwater_xxx, and nobody
can look at this mm_struct when exit_mmap() actually unmaps the memory.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel/exit.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index f923724..c7740fa 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1051,10 +1051,7 @@ NORET_TYPE void do_exit(long code)
 				preempt_count());
 
 	acct_update_integrals(tsk);
-	if (tsk->mm) {
-		update_hiwater_rss(tsk->mm);
-		update_hiwater_vm(tsk->mm);
-	}
+
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
 		hrtimer_cancel(&tsk->signal->real_timer);
-- 
cgit v1.1