From 9d1b79b7848e13e1bf80b736671f76144cc508d4 Mon Sep 17 00:00:00 2001
From: Marcin Juszkiewicz <marcin@juszkiewicz.com.pl>
Date: Wed, 21 Oct 2009 12:22:20 +0200
Subject: linux 2.6.23: keep sched-cfs locally updated to 2.6.23.17

---
 .../linux-2.6.23/sched-cfs-v2.6.23.12-v24.1.patch  | 8567 ++++++++++++++++++++
 recipes/linux/linux_2.6.23.bb                      |    4 +-
 2 files changed, 8569 insertions(+), 2 deletions(-)
 create mode 100644 recipes/linux/linux-2.6.23/sched-cfs-v2.6.23.12-v24.1.patch

diff --git a/recipes/linux/linux-2.6.23/sched-cfs-v2.6.23.12-v24.1.patch b/recipes/linux/linux-2.6.23/sched-cfs-v2.6.23.12-v24.1.patch
new file mode 100644
index 0000000000..77ee5c8f1d
--- /dev/null
+++ b/recipes/linux/linux-2.6.23/sched-cfs-v2.6.23.12-v24.1.patch
@@ -0,0 +1,8567 @@
+---
+ Documentation/sched-design-CFS.txt |   67 +
+ Makefile                           |    2 
+ arch/i386/Kconfig                  |   11 
+ drivers/kvm/kvm.h                  |   10 
+ fs/pipe.c                          |    9 
+ fs/proc/array.c                    |   21 
+ fs/proc/base.c                     |    2 
+ fs/proc/proc_misc.c                |   15 
+ include/linux/cgroup.h             |   12 
+ include/linux/cpuset.h             |    5 
+ include/linux/kernel.h             |    7 
+ include/linux/kernel_stat.h        |    3 
+ include/linux/nodemask.h           |   94 +
+ include/linux/sched.h              |  174 ++
+ include/linux/taskstats.h          |    7 
+ include/linux/topology.h           |    5 
+ init/Kconfig                       |   26 
+ init/main.c                        |    3 
+ kernel/delayacct.c                 |    8 
+ kernel/exit.c                      |    6 
+ kernel/fork.c                      |    5 
+ kernel/ksysfs.c                    |    8 
+ kernel/sched.c                     | 2310 +++++++++++++++++++++++--------------
+ kernel/sched_debug.c               |  289 +++-
+ kernel/sched_fair.c                |  885 ++++++--------
+ kernel/sched_idletask.c            |   26 
+ kernel/sched_rt.c                  |   54 
+ kernel/sched_stats.h               |   40 
+ kernel/sysctl.c                    |   40 
+ kernel/timer.c                     |    7 
+ kernel/tsacct.c                    |    4 
+ kernel/user.c                      |  249 +++
+ mm/memory_hotplug.c                |    7 
+ mm/page_alloc.c                    |   50 
+ mm/vmscan.c                        |    4 
+ net/unix/af_unix.c                 |    4 
+ 36 files changed, 2883 insertions(+), 1586 deletions(-)
+
+--- linux-2.6.23.orig/Documentation/sched-design-CFS.txt
++++ linux-2.6.23/Documentation/sched-design-CFS.txt
+@@ -115,5 +115,72 @@ Some implementation details:
+  - reworked/sanitized SMP load-balancing: the runqueue-walking
+    assumptions are gone from the load-balancing code now, and
+    iterators of the scheduling modules are used. The balancing code got
+    quite a bit simpler as a result.
+ 
++
++Group scheduler extension to CFS
++================================
++
++Normally the scheduler operates on individual tasks and strives to provide
++fair CPU time to each task. Sometimes, it may be desirable to group tasks
++and provide fair CPU time to each such task group. For example, it may
++be desirable to first provide fair CPU time to each user on the system
++and then to each task belonging to a user.
++
++CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets
++SCHED_NORMAL/BATCH tasks be grouped and divides CPU time fairly among such
++groups. At present, there are two (mutually exclusive) mechanisms to group
++tasks for CPU bandwidth control purposes:
++
++	- Based on user id (CONFIG_FAIR_USER_SCHED)
++		In this option, tasks are grouped according to their user id.
++	- Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED)
++		This option lets the administrator create arbitrary groups
++		of tasks, using the "cgroup" pseudo filesystem. See
++		Documentation/cgroups.txt for more information about this
++		filesystem.
++
++Only one of these options to group tasks can be chosen and not both.
++
++Group scheduler tunables:
++
++When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for
++each new user and a "cpu_share" file is added in that directory.
++
++	# cd /sys/kernel/uids
++	# cat 512/cpu_share		# Display user 512's CPU share
++	1024
++	# echo 2048 > 512/cpu_share	# Modify user 512's CPU share
++	# cat 512/cpu_share		# Display user 512's CPU share
++	2048
++	#
++
++CPU bandwidth between two users is divided in the ratio of their CPU shares.
++For example, if you would like user "root" to get twice the bandwidth of user
++"guest", then set the cpu_share for both users such that "root"'s
++cpu_share is twice "guest"'s cpu_share.
++
++
++When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created
++for each group created using the pseudo filesystem. See example steps
++below to create task groups and modify their CPU share using the "cgroup"
++pseudo filesystem:
++
++	# mkdir /dev/cpuctl
++	# mount -t cgroup -ocpu none /dev/cpuctl
++	# cd /dev/cpuctl
++
++	# mkdir multimedia	# create "multimedia" group of tasks
++	# mkdir browser		# create "browser" group of tasks
++
++	# #Configure the multimedia group to receive twice the CPU bandwidth
++	# #of the browser group
++
++	# echo 2048 > multimedia/cpu.shares
++	# echo 1024 > browser/cpu.shares
++
++	# firefox &	# Launch firefox and move it to "browser" group
++	# echo <firefox_pid> > browser/tasks
++
++	# #Launch gmplayer (or your favourite movie player)
++	# echo <movie_player_pid> > multimedia/tasks
+--- linux-2.6.23.orig/Makefile
++++ linux-2.6.23/Makefile
+@@ -1,9 +1,9 @@
+ VERSION = 2
+ PATCHLEVEL = 6
+ SUBLEVEL = 23
+-EXTRAVERSION = .17
++EXTRAVERSION = .17-cfs-v24.1
+ NAME = Arr Matey! A Hairy Bilge Rat!
+ 
+ # *DOCUMENTATION*
+ # To see a list of typical targets execute "make help"
+ # More info can be located in ./README
+--- linux-2.6.23.orig/arch/i386/Kconfig
++++ linux-2.6.23/arch/i386/Kconfig
+@@ -212,10 +212,21 @@ config X86_ES7000
+ 	  Only choose this option if you have such a system, otherwise you
+ 	  should say N here.
+ 
+ endchoice
+ 
++config SCHED_NO_NO_OMIT_FRAME_POINTER
++	bool "Single-depth WCHAN output"
++	default y
++	help
++	  Calculate simpler /proc/<PID>/wchan values. If this option
++	  is disabled then wchan values will recurse back to the
++	  caller function. This provides more accurate wchan values,
++	  at the expense of slightly more scheduling overhead.
++
++	  If in doubt, say "Y".
++
+ config PARAVIRT
+ 	bool "Paravirtualization support (EXPERIMENTAL)"
+ 	depends on EXPERIMENTAL
+ 	depends on !(X86_VISWS || X86_VOYAGER)
+ 	help
+--- linux-2.6.23.orig/drivers/kvm/kvm.h
++++ linux-2.6.23/drivers/kvm/kvm.h
+@@ -623,10 +623,20 @@ void __kvm_mmu_free_some_pages(struct kv
+ int kvm_mmu_load(struct kvm_vcpu *vcpu);
+ void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+ 
+ int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
+ 
++static inline void kvm_guest_enter(void)
++{
++	current->flags |= PF_VCPU;
++}
++
++static inline void kvm_guest_exit(void)
++{
++	current->flags &= ~PF_VCPU;
++}
++
+ static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
+ 				     u32 error_code)
+ {
+ 	return vcpu->mmu.page_fault(vcpu, gva, error_code);
+ }
+--- linux-2.6.23.orig/fs/pipe.c
++++ linux-2.6.23/fs/pipe.c
+@@ -43,12 +43,11 @@ void pipe_wait(struct pipe_inode_info *p
+ 
+ 	/*
+ 	 * Pipes are system-local resources, so sleeping on them
+ 	 * is considered a noninteractive wait:
+ 	 */
+-	prepare_to_wait(&pipe->wait, &wait,
+-			TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE);
++	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
+ 	if (pipe->inode)
+ 		mutex_unlock(&pipe->inode->i_mutex);
+ 	schedule();
+ 	finish_wait(&pipe->wait, &wait);
+ 	if (pipe->inode)
+@@ -381,11 +380,11 @@ redo:
+ 	}
+ 	mutex_unlock(&inode->i_mutex);
+ 
+ 	/* Signal writers asynchronously that there is more room. */
+ 	if (do_wakeup) {
+-		wake_up_interruptible(&pipe->wait);
++		wake_up_interruptible_sync(&pipe->wait);
+ 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
+ 	}
+ 	if (ret > 0)
+ 		file_accessed(filp);
+ 	return ret;
+@@ -554,11 +553,11 @@ redo2:
+ 		pipe->waiting_writers--;
+ 	}
+ out:
+ 	mutex_unlock(&inode->i_mutex);
+ 	if (do_wakeup) {
+-		wake_up_interruptible(&pipe->wait);
++		wake_up_interruptible_sync(&pipe->wait);
+ 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+ 	}
+ 	if (ret > 0)
+ 		file_update_time(filp);
+ 	return ret;
+@@ -648,11 +647,11 @@ pipe_release(struct inode *inode, int de
+ 	pipe->writers -= decw;
+ 
+ 	if (!pipe->readers && !pipe->writers) {
+ 		free_pipe_info(inode);
+ 	} else {
+-		wake_up_interruptible(&pipe->wait);
++		wake_up_interruptible_sync(&pipe->wait);
+ 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+ 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
+ 	}
+ 	mutex_unlock(&inode->i_mutex);
+ 
+--- linux-2.6.23.orig/fs/proc/array.c
++++ linux-2.6.23/fs/proc/array.c
+@@ -365,15 +365,22 @@ static cputime_t task_stime(struct task_
+ 	 * grows monotonically - apps rely on that):
+ 	 */
+ 	stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
+ 			cputime_to_clock_t(task_utime(p));
+ 
+-	p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
++	if (stime >= 0)
++		p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
++
+ 	return p->prev_stime;
+ }
+ #endif
+ 
++static cputime_t task_gtime(struct task_struct *p)
++{
++	return p->gtime;
++}
++
+ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
+ {
+ 	unsigned long vsize, eip, esp, wchan = ~0UL;
+ 	long priority, nice;
+ 	int tty_pgrp = -1, tty_nr = 0;
+@@ -385,10 +392,11 @@ static int do_task_stat(struct task_stru
+ 	struct mm_struct *mm;
+ 	unsigned long long start_time;
+ 	unsigned long cmin_flt = 0, cmaj_flt = 0;
+ 	unsigned long  min_flt = 0,  maj_flt = 0;
+ 	cputime_t cutime, cstime, utime, stime;
++	cputime_t cgtime, gtime;
+ 	unsigned long rsslim = 0;
+ 	char tcomm[sizeof(task->comm)];
+ 	unsigned long flags;
+ 
+ 	state = *get_task_state(task);
+@@ -403,10 +411,11 @@ static int do_task_stat(struct task_stru
+ 	get_task_comm(tcomm, task);
+ 
+ 	sigemptyset(&sigign);
+ 	sigemptyset(&sigcatch);
+ 	cutime = cstime = utime = stime = cputime_zero;
++	cgtime = gtime = cputime_zero;
+ 
+ 	rcu_read_lock();
+ 	if (lock_task_sighand(task, &flags)) {
+ 		struct signal_struct *sig = task->signal;
+ 
+@@ -420,27 +429,30 @@ static int do_task_stat(struct task_stru
+ 
+ 		cmin_flt = sig->cmin_flt;
+ 		cmaj_flt = sig->cmaj_flt;
+ 		cutime = sig->cutime;
+ 		cstime = sig->cstime;
++		cgtime = sig->cgtime;
+ 		rsslim = sig->rlim[RLIMIT_RSS].rlim_cur;
+ 
+ 		/* add up live thread stats at the group level */
+ 		if (whole) {
+ 			struct task_struct *t = task;
+ 			do {
+ 				min_flt += t->min_flt;
+ 				maj_flt += t->maj_flt;
+ 				utime = cputime_add(utime, task_utime(t));
+ 				stime = cputime_add(stime, task_stime(t));
++				gtime = cputime_add(gtime, task_gtime(t));
+ 				t = next_thread(t);
+ 			} while (t != task);
+ 
+ 			min_flt += sig->min_flt;
+ 			maj_flt += sig->maj_flt;
+ 			utime = cputime_add(utime, sig->utime);
+ 			stime = cputime_add(stime, sig->stime);
++			gtime = cputime_add(gtime, sig->gtime);
+ 		}
+ 
+ 		sid = signal_session(sig);
+ 		pgid = process_group(task);
+ 		ppid = rcu_dereference(task->real_parent)->tgid;
+@@ -454,10 +466,11 @@ static int do_task_stat(struct task_stru
+ 	if (!whole) {
+ 		min_flt = task->min_flt;
+ 		maj_flt = task->maj_flt;
+ 		utime = task_utime(task);
+ 		stime = task_stime(task);
++		gtime = task_gtime(task);
+ 	}
+ 
+ 	/* scale priority and nice values from timeslices to -20..20 */
+ 	/* to make it look like a "normal" Unix priority/nice value  */
+ 	priority = task_prio(task);
+@@ -471,11 +484,11 @@ static int do_task_stat(struct task_stru
+ 	/* convert nsec -> ticks */
+ 	start_time = nsec_to_clock_t(start_time);
+ 
+ 	res = sprintf(buffer, "%d (%s) %c %d %d %d %d %d %u %lu \
+ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
+-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu\n",
++%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n",
+ 		task->pid,
+ 		tcomm,
+ 		state,
+ 		ppid,
+ 		pgid,
+@@ -516,11 +529,13 @@ static int do_task_stat(struct task_stru
+ 		0UL,
+ 		task->exit_signal,
+ 		task_cpu(task),
+ 		task->rt_priority,
+ 		task->policy,
+-		(unsigned long long)delayacct_blkio_ticks(task));
++		(unsigned long long)delayacct_blkio_ticks(task),
++		cputime_to_clock_t(gtime),
++		cputime_to_clock_t(cgtime));
+ 	if (mm)
+ 		mmput(mm);
+ 	return res;
+ }
+ 
+--- linux-2.6.23.orig/fs/proc/base.c
++++ linux-2.6.23/fs/proc/base.c
+@@ -302,11 +302,11 @@ static int proc_pid_wchan(struct task_st
+ static int proc_pid_schedstat(struct task_struct *task, char *buffer)
+ {
+ 	return sprintf(buffer, "%llu %llu %lu\n",
+ 			task->sched_info.cpu_time,
+ 			task->sched_info.run_delay,
+-			task->sched_info.pcnt);
++			task->sched_info.pcount);
+ }
+ #endif
+ 
+ /* The badness from the OOM killer */
+ unsigned long badness(struct task_struct *p, unsigned long uptime);
+--- linux-2.6.23.orig/fs/proc/proc_misc.c
++++ linux-2.6.23/fs/proc/proc_misc.c
+@@ -441,20 +441,22 @@ static const struct file_operations proc
+ static int show_stat(struct seq_file *p, void *v)
+ {
+ 	int i;
+ 	unsigned long jif;
+ 	cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
++	cputime64_t guest;
+ 	u64 sum = 0;
+ 	struct timespec boottime;
+ 	unsigned int *per_irq_sum;
+ 
+ 	per_irq_sum = kzalloc(sizeof(unsigned int)*NR_IRQS, GFP_KERNEL);
+ 	if (!per_irq_sum)
+ 		return -ENOMEM;
+ 
+ 	user = nice = system = idle = iowait =
+ 		irq = softirq = steal = cputime64_zero;
++	guest = cputime64_zero;
+ 	getboottime(&boottime);
+ 	jif = boottime.tv_sec;
+ 
+ 	for_each_possible_cpu(i) {
+ 		int j;
+@@ -465,26 +467,28 @@ static int show_stat(struct seq_file *p,
+ 		idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle);
+ 		iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait);
+ 		irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
+ 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
+ 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
++		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
+ 		for (j = 0; j < NR_IRQS; j++) {
+ 			unsigned int temp = kstat_cpu(i).irqs[j];
+ 			sum += temp;
+ 			per_irq_sum[j] += temp;
+ 		}
+ 	}
+ 
+-	seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu\n",
++	seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+ 		(unsigned long long)cputime64_to_clock_t(user),
+ 		(unsigned long long)cputime64_to_clock_t(nice),
+ 		(unsigned long long)cputime64_to_clock_t(system),
+ 		(unsigned long long)cputime64_to_clock_t(idle),
+ 		(unsigned long long)cputime64_to_clock_t(iowait),
+ 		(unsigned long long)cputime64_to_clock_t(irq),
+ 		(unsigned long long)cputime64_to_clock_t(softirq),
+-		(unsigned long long)cputime64_to_clock_t(steal));
++		(unsigned long long)cputime64_to_clock_t(steal),
++		(unsigned long long)cputime64_to_clock_t(guest));
+ 	for_each_online_cpu(i) {
+ 
+ 		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
+ 		user = kstat_cpu(i).cpustat.user;
+ 		nice = kstat_cpu(i).cpustat.nice;
+@@ -492,20 +496,23 @@ static int show_stat(struct seq_file *p,
+ 		idle = kstat_cpu(i).cpustat.idle;
+ 		iowait = kstat_cpu(i).cpustat.iowait;
+ 		irq = kstat_cpu(i).cpustat.irq;
+ 		softirq = kstat_cpu(i).cpustat.softirq;
+ 		steal = kstat_cpu(i).cpustat.steal;
+-		seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu\n",
++		guest = kstat_cpu(i).cpustat.guest;
++		seq_printf(p,
++			"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+ 			i,
+ 			(unsigned long long)cputime64_to_clock_t(user),
+ 			(unsigned long long)cputime64_to_clock_t(nice),
+ 			(unsigned long long)cputime64_to_clock_t(system),
+ 			(unsigned long long)cputime64_to_clock_t(idle),
+ 			(unsigned long long)cputime64_to_clock_t(iowait),
+ 			(unsigned long long)cputime64_to_clock_t(irq),
+ 			(unsigned long long)cputime64_to_clock_t(softirq),
+-			(unsigned long long)cputime64_to_clock_t(steal));
++			(unsigned long long)cputime64_to_clock_t(steal),
++			(unsigned long long)cputime64_to_clock_t(guest));
+ 	}
+ 	seq_printf(p, "intr %llu", (unsigned long long)sum);
+ 
+ #ifndef CONFIG_SMP
+ 	/* Touches too many cache lines on SMP setups */
+--- /dev/null
++++ linux-2.6.23/include/linux/cgroup.h
+@@ -0,0 +1,12 @@
++#ifndef _LINUX_CGROUP_H
++#define _LINUX_CGROUP_H
++
++/*
++ * Control groups are not backported - we use a few compatibility
++ * defines to be able to use the upstream sched.c as-is:
++ */
++#define task_pid_nr(task)		(task)->pid
++#define task_pid_vnr(task)		(task)->pid
++#define find_task_by_vpid(pid)		find_task_by_pid(pid)
++
++#endif
+--- linux-2.6.23.orig/include/linux/cpuset.h
++++ linux-2.6.23/include/linux/cpuset.h
+@@ -144,8 +144,13 @@ static inline int cpuset_do_slab_mem_spr
+ 	return 0;
+ }
+ 
+ static inline void cpuset_track_online_nodes(void) {}
+ 
++static inline cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p)
++{
++	return cpu_possible_map;
++}
++
+ #endif /* !CONFIG_CPUSETS */
+ 
+ #endif /* _LINUX_CPUSET_H */
+--- linux-2.6.23.orig/include/linux/kernel.h
++++ linux-2.6.23/include/linux/kernel.h
+@@ -59,10 +59,17 @@ extern const char linux_proc_banner[];
+ #define	KERN_WARNING	"<4>"	/* warning conditions			*/
+ #define	KERN_NOTICE	"<5>"	/* normal but significant condition	*/
+ #define	KERN_INFO	"<6>"	/* informational			*/
+ #define	KERN_DEBUG	"<7>"	/* debug-level messages			*/
+ 
++/*
++ * Annotation for a "continued" line of log printout (only done after a
++ * line that had no enclosing \n). Only to be used by core/arch code
++ * during early bootup (a continued line is not SMP-safe otherwise).
++ */
++#define	KERN_CONT	""
++
+ extern int console_printk[];
+ 
+ #define console_loglevel (console_printk[0])
+ #define default_message_loglevel (console_printk[1])
+ #define minimum_console_loglevel (console_printk[2])
+--- linux-2.6.23.orig/include/linux/kernel_stat.h
++++ linux-2.6.23/include/linux/kernel_stat.h
+@@ -21,10 +21,11 @@ struct cpu_usage_stat {
+ 	cputime64_t softirq;
+ 	cputime64_t irq;
+ 	cputime64_t idle;
+ 	cputime64_t iowait;
+ 	cputime64_t steal;
++	cputime64_t guest;
+ };
+ 
+ struct kernel_stat {
+ 	struct cpu_usage_stat	cpustat;
+ 	unsigned int irqs[NR_IRQS];
+@@ -50,9 +51,11 @@ static inline int kstat_irqs(int irq)
+ 
+ 	return sum;
+ }
+ 
+ extern void account_user_time(struct task_struct *, cputime_t);
++extern void account_user_time_scaled(struct task_struct *, cputime_t);
+ extern void account_system_time(struct task_struct *, int, cputime_t);
++extern void account_system_time_scaled(struct task_struct *, cputime_t);
+ extern void account_steal_time(struct task_struct *, cputime_t);
+ 
+ #endif /* _LINUX_KERNEL_STAT_H */
+--- linux-2.6.23.orig/include/linux/nodemask.h
++++ linux-2.6.23/include/linux/nodemask.h
+@@ -336,46 +336,108 @@ static inline void __nodes_remap(nodemas
+ 	if (!nodes_empty(mask))				\
+ 		for ((node) = 0; (node) < 1; (node)++)
+ #endif /* MAX_NUMNODES */
+ 
+ /*
++ * Bitmasks that are kept for all the nodes.
++ */
++enum node_states {
++	N_POSSIBLE,		/* The node could become online at some point */
++	N_ONLINE,		/* The node is online */
++	N_NORMAL_MEMORY,	/* The node has regular memory */
++#ifdef CONFIG_HIGHMEM
++	N_HIGH_MEMORY,		/* The node has regular or high memory */
++#else
++	N_HIGH_MEMORY = N_NORMAL_MEMORY,
++#endif
++	N_CPU,		/* The node has one or more cpus */
++	NR_NODE_STATES
++};
++
++/*
+  * The following particular system nodemasks and operations
+  * on them manage all possible and online nodes.
+  */
+ 
+-extern nodemask_t node_online_map;
+-extern nodemask_t node_possible_map;
++extern nodemask_t node_states[NR_NODE_STATES];
+ 
+ #if MAX_NUMNODES > 1
+-#define num_online_nodes()	nodes_weight(node_online_map)
+-#define num_possible_nodes()	nodes_weight(node_possible_map)
+-#define node_online(node)	node_isset((node), node_online_map)
+-#define node_possible(node)	node_isset((node), node_possible_map)
+-#define first_online_node	first_node(node_online_map)
+-#define next_online_node(nid)	next_node((nid), node_online_map)
++static inline int node_state(int node, enum node_states state)
++{
++	return node_isset(node, node_states[state]);
++}
++
++static inline void node_set_state(int node, enum node_states state)
++{
++	__node_set(node, &node_states[state]);
++}
++
++static inline void node_clear_state(int node, enum node_states state)
++{
++	__node_clear(node, &node_states[state]);
++}
++
++static inline int num_node_state(enum node_states state)
++{
++	return nodes_weight(node_states[state]);
++}
++
++#define for_each_node_state(__node, __state) \
++	for_each_node_mask((__node), node_states[__state])
++
++#define first_online_node	first_node(node_states[N_ONLINE])
++#define next_online_node(nid)	next_node((nid), node_states[N_ONLINE])
++
+ extern int nr_node_ids;
+ #else
+-#define num_online_nodes()	1
+-#define num_possible_nodes()	1
+-#define node_online(node)	((node) == 0)
+-#define node_possible(node)	((node) == 0)
++
++static inline int node_state(int node, enum node_states state)
++{
++	return node == 0;
++}
++
++static inline void node_set_state(int node, enum node_states state)
++{
++}
++
++static inline void node_clear_state(int node, enum node_states state)
++{
++}
++
++static inline int num_node_state(enum node_states state)
++{
++	return 1;
++}
++
++#define for_each_node_state(node, __state) \
++	for ( (node) = 0; (node) == 0; (node) = 1)
++
+ #define first_online_node	0
+ #define next_online_node(nid)	(MAX_NUMNODES)
+ #define nr_node_ids		1
++
+ #endif
+ 
++#define node_online_map 	node_states[N_ONLINE]
++#define node_possible_map 	node_states[N_POSSIBLE]
++
+ #define any_online_node(mask)			\
+ ({						\
+ 	int node;				\
+ 	for_each_node_mask(node, (mask))	\
+ 		if (node_online(node))		\
+ 			break;			\
+ 	node;					\
+ })
+ 
+-#define node_set_online(node)	   set_bit((node), node_online_map.bits)
+-#define node_set_offline(node)	   clear_bit((node), node_online_map.bits)
++#define num_online_nodes()	num_node_state(N_ONLINE)
++#define num_possible_nodes()	num_node_state(N_POSSIBLE)
++#define node_online(node)	node_state((node), N_ONLINE)
++#define node_possible(node)	node_state((node), N_POSSIBLE)
++
++#define node_set_online(node)	   node_set_state((node), N_ONLINE)
++#define node_set_offline(node)	   node_clear_state((node), N_ONLINE)
+ 
+-#define for_each_node(node)	   for_each_node_mask((node), node_possible_map)
+-#define for_each_online_node(node) for_each_node_mask((node), node_online_map)
++#define for_each_node(node)	   for_each_node_state(node, N_POSSIBLE)
++#define for_each_online_node(node) for_each_node_state(node, N_ONLINE)
+ 
+ #endif /* __LINUX_NODEMASK_H */
+--- linux-2.6.23.orig/include/linux/sched.h
++++ linux-2.6.23/include/linux/sched.h
+@@ -1,10 +1,21 @@
+ #ifndef _LINUX_SCHED_H
+ #define _LINUX_SCHED_H
+ 
+ #include <linux/auxvec.h>	/* For AT_VECTOR_SIZE */
+ 
++/* backporting helper macro: */
++#define cpu_sibling_map(cpu) cpu_sibling_map[cpu]
++
++/*
++ * Control groups are not backported - we use a few compatibility
++ * defines to be able to use the upstream sched.c as-is:
++ */
++#define task_pid_nr(task)               (task)->pid
++#define task_pid_vnr(task)              (task)->pid
++#define find_task_by_vpid(pid)          find_task_by_pid(pid)
++
+ /*
+  * cloning flags:
+  */
+ #define CSIGNAL		0x000000ff	/* signal mask to be sent at exit */
+ #define CLONE_VM	0x00000100	/* set if VM shared between processes */
+@@ -84,10 +95,11 @@ struct sched_param {
+ #include <linux/param.h>
+ #include <linux/resource.h>
+ #include <linux/timer.h>
+ #include <linux/hrtimer.h>
+ #include <linux/task_io_accounting.h>
++#include <linux/kobject.h>
+ 
+ #include <asm/processor.h>
+ 
+ struct exec_domain;
+ struct futex_pi_state;
+@@ -133,10 +145,11 @@ extern unsigned long nr_active(void);
+ extern unsigned long nr_iowait(void);
+ extern unsigned long weighted_cpuload(const int cpu);
+ 
+ struct seq_file;
+ struct cfs_rq;
++struct task_group;
+ #ifdef CONFIG_SCHED_DEBUG
+ extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
+ extern void proc_sched_set_task(struct task_struct *p);
+ extern void
+ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+@@ -171,12 +184,11 @@ print_cfs_rq(struct seq_file *m, int cpu
+ #define TASK_TRACED		8
+ /* in tsk->exit_state */
+ #define EXIT_ZOMBIE		16
+ #define EXIT_DEAD		32
+ /* in tsk->state again */
+-#define TASK_NONINTERACTIVE	64
+-#define TASK_DEAD		128
++#define TASK_DEAD		64
+ 
+ #define __set_task_state(tsk, state_value)		\
+ 	do { (tsk)->state = (state_value); } while (0)
+ #define set_task_state(tsk, state_value)		\
+ 	set_mb((tsk)->state, (state_value))
+@@ -276,10 +288,14 @@ static inline void touch_all_softlockup_
+ #endif
+ 
+ 
+ /* Attach to any functions which should be ignored in wchan output. */
+ #define __sched		__attribute__((__section__(".sched.text")))
++
++/* Linker adds these: start and end of __sched functions */
++extern char __sched_text_start[], __sched_text_end[];
++
+ /* Is this address in the __sched functions? */
+ extern int in_sched_functions(unsigned long addr);
+ 
+ #define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
+ extern signed long FASTCALL(schedule_timeout(signed long timeout));
+@@ -513,10 +529,12 @@ struct signal_struct {
+ 	 * and for reaped dead child processes forked by this group.
+ 	 * Live threads maintain their own counters and add to these
+ 	 * in __exit_signal, except for the group leader.
+ 	 */
+ 	cputime_t utime, stime, cutime, cstime;
++	cputime_t gtime;
++	cputime_t cgtime;
+ 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
+ 	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
+ 	unsigned long inblock, oublock, cinblock, coublock;
+ 
+ 	/*
+@@ -593,12 +611,27 @@ struct user_struct {
+ #endif
+ 
+ 	/* Hash table maintenance information */
+ 	struct hlist_node uidhash_node;
+ 	uid_t uid;
++
++#ifdef CONFIG_FAIR_USER_SCHED
++	struct task_group *tg;
++#ifdef CONFIG_SYSFS
++	struct kset kset;
++	struct subsys_attribute user_attr;
++	struct work_struct work;
++#endif
++#endif
+ };
+ 
++#ifdef CONFIG_FAIR_USER_SCHED
++extern int uids_kobject_init(void);
++#else
++static inline int uids_kobject_init(void) { return 0; }
++#endif
++
+ extern struct user_struct *find_user(uid_t);
+ 
+ extern struct user_struct root_user;
+ #define INIT_USER (&root_user)
+ 
+@@ -606,17 +639,21 @@ struct backing_dev_info;
+ struct reclaim_state;
+ 
+ #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+ struct sched_info {
+ 	/* cumulative counters */
+-	unsigned long pcnt;	      /* # of times run on this cpu */
++	unsigned long pcount;	      /* # of times run on this cpu */
+ 	unsigned long long cpu_time,  /* time spent on the cpu */
+ 			   run_delay; /* time spent waiting on a runqueue */
+ 
+ 	/* timestamps */
+ 	unsigned long long last_arrival,/* when we last ran on a cpu */
+ 			   last_queued;	/* when we were last queued to run */
++#ifdef CONFIG_SCHEDSTATS
++	/* BKL stats */
++	unsigned int bkl_count;
++#endif
+ };
+ #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
+ 
+ #ifdef CONFIG_SCHEDSTATS
+ extern const struct file_operations proc_schedstat_operations;
+@@ -747,43 +784,42 @@ struct sched_domain {
+ 	unsigned int balance_interval;	/* initialise to 1. units in ms. */
+ 	unsigned int nr_balance_failed; /* initialise to 0 */
+ 
+ #ifdef CONFIG_SCHEDSTATS
+ 	/* load_balance() stats */
+-	unsigned long lb_cnt[CPU_MAX_IDLE_TYPES];
+-	unsigned long lb_failed[CPU_MAX_IDLE_TYPES];
+-	unsigned long lb_balanced[CPU_MAX_IDLE_TYPES];
+-	unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES];
+-	unsigned long lb_gained[CPU_MAX_IDLE_TYPES];
+-	unsigned long lb_hot_gained[CPU_MAX_IDLE_TYPES];
+-	unsigned long lb_nobusyg[CPU_MAX_IDLE_TYPES];
+-	unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES];
++	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
++	unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
++	unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
++	unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
++	unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
++	unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
++	unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
++	unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];
+ 
+ 	/* Active load balancing */
+-	unsigned long alb_cnt;
+-	unsigned long alb_failed;
+-	unsigned long alb_pushed;
++	unsigned int alb_count;
++	unsigned int alb_failed;
++	unsigned int alb_pushed;
+ 
+ 	/* SD_BALANCE_EXEC stats */
+-	unsigned long sbe_cnt;
+-	unsigned long sbe_balanced;
+-	unsigned long sbe_pushed;
++	unsigned int sbe_count;
++	unsigned int sbe_balanced;
++	unsigned int sbe_pushed;
+ 
+ 	/* SD_BALANCE_FORK stats */
+-	unsigned long sbf_cnt;
+-	unsigned long sbf_balanced;
+-	unsigned long sbf_pushed;
++	unsigned int sbf_count;
++	unsigned int sbf_balanced;
++	unsigned int sbf_pushed;
+ 
+ 	/* try_to_wake_up() stats */
+-	unsigned long ttwu_wake_remote;
+-	unsigned long ttwu_move_affine;
+-	unsigned long ttwu_move_balance;
++	unsigned int ttwu_wake_remote;
++	unsigned int ttwu_move_affine;
++	unsigned int ttwu_move_balance;
+ #endif
+ };
+ 
+-extern int partition_sched_domains(cpumask_t *partition1,
+-				    cpumask_t *partition2);
++extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new);
+ 
+ #endif	/* CONFIG_SMP */
+ 
+ /*
+  * A runqueue laden with a single nice 0 task scores a weighted_cpuload of
+@@ -851,27 +887,32 @@ struct uts_namespace;
+ 
+ struct rq;
+ struct sched_domain;
+ 
+ struct sched_class {
+-	struct sched_class *next;
++	const struct sched_class *next;
+ 
+ 	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
+ 	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
+-	void (*yield_task) (struct rq *rq, struct task_struct *p);
++	void (*yield_task) (struct rq *rq);
+ 
+ 	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
+ 
+ 	struct task_struct * (*pick_next_task) (struct rq *rq);
+ 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
+ 
++#ifdef CONFIG_SMP
+ 	unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
+-			struct rq *busiest,
+-			unsigned long max_nr_move, unsigned long max_load_move,
++			struct rq *busiest, unsigned long max_load_move,
+ 			struct sched_domain *sd, enum cpu_idle_type idle,
+ 			int *all_pinned, int *this_best_prio);
+ 
++	int (*move_one_task) (struct rq *this_rq, int this_cpu,
++			      struct rq *busiest, struct sched_domain *sd,
++			      enum cpu_idle_type idle);
++#endif
++
+ 	void (*set_curr_task) (struct rq *rq);
+ 	void (*task_tick) (struct rq *rq, struct task_struct *p);
+ 	void (*task_new) (struct rq *rq, struct task_struct *p);
+ };
+ 
+@@ -885,46 +926,52 @@ struct load_weight {
+  * Current field usage histogram:
+  *
+  *     4 se->block_start
+  *     4 se->run_node
+  *     4 se->sleep_start
+- *     4 se->sleep_start_fair
+  *     6 se->load.weight
+- *     7 se->delta_fair
+- *    15 se->wait_runtime
+  */
+ struct sched_entity {
+-	long			wait_runtime;
+-	unsigned long		delta_fair_run;
+-	unsigned long		delta_fair_sleep;
+-	unsigned long		delta_exec;
+-	s64			fair_key;
+ 	struct load_weight	load;		/* for load-balancing */
+ 	struct rb_node		run_node;
+ 	unsigned int		on_rq;
+ 
+ 	u64			exec_start;
+ 	u64			sum_exec_runtime;
++	u64			vruntime;
+ 	u64			prev_sum_exec_runtime;
+-	u64			wait_start_fair;
+-	u64			sleep_start_fair;
+ 
+ #ifdef CONFIG_SCHEDSTATS
+ 	u64			wait_start;
+ 	u64			wait_max;
+-	s64			sum_wait_runtime;
+ 
+ 	u64			sleep_start;
+ 	u64			sleep_max;
+ 	s64			sum_sleep_runtime;
+ 
+ 	u64			block_start;
+ 	u64			block_max;
+ 	u64			exec_max;
++	u64			slice_max;
+ 
+-	unsigned long		wait_runtime_overruns;
+-	unsigned long		wait_runtime_underruns;
++	u64			nr_migrations;
++	u64			nr_migrations_cold;
++	u64			nr_failed_migrations_affine;
++	u64			nr_failed_migrations_running;
++	u64			nr_failed_migrations_hot;
++	u64			nr_forced_migrations;
++	u64			nr_forced2_migrations;
++
++	u64			nr_wakeups;
++	u64			nr_wakeups_sync;
++	u64			nr_wakeups_migrate;
++	u64			nr_wakeups_local;
++	u64			nr_wakeups_remote;
++	u64			nr_wakeups_affine;
++	u64			nr_wakeups_affine_attempts;
++	u64			nr_wakeups_passive;
++	u64			nr_wakeups_idle;
+ #endif
+ 
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+ 	struct sched_entity	*parent;
+ 	/* rq on which this entity is (to be) queued: */
+@@ -949,11 +996,11 @@ struct task_struct {
+ #endif
+ #endif
+ 
+ 	int prio, static_prio, normal_prio;
+ 	struct list_head run_list;
+-	struct sched_class *sched_class;
++	const struct sched_class *sched_class;
+ 	struct sched_entity se;
+ 
+ #ifdef CONFIG_PREEMPT_NOTIFIERS
+ 	/* list of struct preempt_notifier: */
+ 	struct hlist_head preempt_notifiers;
+@@ -1019,11 +1066,12 @@ struct task_struct {
+ 	struct completion *vfork_done;		/* for vfork() */
+ 	int __user *set_child_tid;		/* CLONE_CHILD_SETTID */
+ 	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
+ 
+ 	unsigned int rt_priority;
+-	cputime_t utime, stime;
++	cputime_t utime, stime, utimescaled, stimescaled;
++	cputime_t gtime;
+ 	cputime_t prev_utime, prev_stime;
+ 	unsigned long nvcsw, nivcsw; /* context switch counts */
+ 	struct timespec start_time; 		/* monotonic time */
+ 	struct timespec real_start_time;	/* boot based time */
+ /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
+@@ -1312,10 +1360,11 @@ static inline void put_task_struct(struc
+ #define PF_ALIGNWARN	0x00000001	/* Print alignment warning msgs */
+ 					/* Not implemented yet, only for 486*/
+ #define PF_STARTING	0x00000002	/* being created */
+ #define PF_EXITING	0x00000004	/* getting shut down */
+ #define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
++#define PF_VCPU		0x00000010	/* I'm a virtual CPU */
+ #define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
+ #define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
+ #define PF_DUMPCORE	0x00000200	/* dumped core */
+ #define PF_SIGNALED	0x00000400	/* killed by a signal */
+ #define PF_MEMALLOC	0x00000800	/* Allocating memory */
+@@ -1399,19 +1448,30 @@ extern void idle_task_exit(void);
+ static inline void idle_task_exit(void) {}
+ #endif
+ 
+ extern void sched_idle_next(void);
+ 
++#ifdef CONFIG_SCHED_DEBUG
+ extern unsigned int sysctl_sched_latency;
+ extern unsigned int sysctl_sched_min_granularity;
+ extern unsigned int sysctl_sched_wakeup_granularity;
+ extern unsigned int sysctl_sched_batch_wakeup_granularity;
+-extern unsigned int sysctl_sched_stat_granularity;
+-extern unsigned int sysctl_sched_runtime_limit;
+-extern unsigned int sysctl_sched_compat_yield;
+ extern unsigned int sysctl_sched_child_runs_first;
+ extern unsigned int sysctl_sched_features;
++extern unsigned int sysctl_sched_migration_cost;
++extern unsigned int sysctl_sched_nr_migrate;
++#ifdef CONFIG_FAIR_GROUP_SCHED
++extern unsigned int sysctl_sched_min_bal_int_shares;
++extern unsigned int sysctl_sched_max_bal_int_shares;
++#endif
++
++int sched_nr_latency_handler(struct ctl_table *table, int write,
++		struct file *file, void __user *buffer, size_t *length,
++		loff_t *ppos);
++#endif
++
++extern unsigned int sysctl_sched_compat_yield;
+ 
+ #ifdef CONFIG_RT_MUTEXES
+ extern int rt_mutex_getprio(struct task_struct *p);
+ extern void rt_mutex_setprio(struct task_struct *p, int prio);
+ extern void rt_mutex_adjust_pi(struct task_struct *p);
+@@ -1841,10 +1901,22 @@ extern long sched_getaffinity(pid_t pid,
+ 
+ extern int sched_mc_power_savings, sched_smt_power_savings;
+ 
+ extern void normalize_rt_tasks(void);
+ 
++#ifdef CONFIG_FAIR_GROUP_SCHED
++
++extern struct task_group init_task_group;
++
++extern struct task_group *sched_create_group(void);
++extern void sched_destroy_group(struct task_group *tg);
++extern void sched_move_task(struct task_struct *tsk);
++extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
++extern unsigned long sched_group_shares(struct task_group *tg);
++
++#endif
++
+ #ifdef CONFIG_TASK_XACCT
+ static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
+ {
+ 	tsk->rchar += amt;
+ }
+@@ -1879,8 +1951,16 @@ static inline void inc_syscr(struct task
+ static inline void inc_syscw(struct task_struct *tsk)
+ {
+ }
+ #endif
+ 
++#ifdef CONFIG_SMP
++void migration_init(void);
++#else
++static inline void migration_init(void)
++{
++}
++#endif
++
+ #endif /* __KERNEL__ */
+ 
+ #endif
+--- linux-2.6.23.orig/include/linux/taskstats.h
++++ linux-2.6.23/include/linux/taskstats.h
+@@ -29,11 +29,11 @@
+  *	b) add comment indicating new version number at end of struct
+  *	c) add new fields after version comment; maintain 64-bit alignment
+  */
+ 
+ 
+-#define TASKSTATS_VERSION	5
++#define TASKSTATS_VERSION	6
+ #define TS_COMM_LEN		32	/* should be >= TASK_COMM_LEN
+ 					 * in linux/sched.h */
+ 
+ struct taskstats {
+ 
+@@ -150,10 +150,15 @@ struct taskstats {
+ 	__u64	write_bytes;		/* bytes of write I/O */
+ 	__u64	cancelled_write_bytes;	/* bytes of cancelled write I/O */
+ 
+ 	__u64  nvcsw;			/* voluntary_ctxt_switches */
+ 	__u64  nivcsw;			/* nonvoluntary_ctxt_switches */
++
++	/* time accounting for SMT machines */
++	__u64	ac_utimescaled;		/* utime scaled on frequency etc */
++	__u64	ac_stimescaled;		/* stime scaled on frequency etc */
++	__u64	cpu_scaled_run_real_total; /* scaled cpu_run_real_total */
+ };
+ 
+ 
+ /*
+  * Commands sent from userspace
+--- linux-2.6.23.orig/include/linux/topology.h
++++ linux-2.6.23/include/linux/topology.h
+@@ -157,19 +157,18 @@
+ 	.max_interval		= 4,			\
+ 	.busy_factor		= 64,			\
+ 	.imbalance_pct		= 125,			\
+ 	.cache_nice_tries	= 1,			\
+ 	.busy_idx		= 2,			\
+-	.idle_idx		= 0,			\
+-	.newidle_idx		= 0,			\
++	.idle_idx		= 1,			\
++	.newidle_idx		= 2,			\
+ 	.wake_idx		= 1,			\
+ 	.forkexec_idx		= 1,			\
+ 	.flags			= SD_LOAD_BALANCE	\
+ 				| SD_BALANCE_NEWIDLE	\
+ 				| SD_BALANCE_EXEC	\
+ 				| SD_WAKE_AFFINE	\
+-				| SD_WAKE_IDLE		\
+ 				| BALANCE_FOR_PKG_POWER,\
+ 	.last_balance		= jiffies,		\
+ 	.balance_interval	= 1,			\
+ 	.nr_balance_failed	= 0,			\
+ }
+--- linux-2.6.23.orig/init/Kconfig
++++ linux-2.6.23/init/Kconfig
+@@ -271,18 +271,44 @@ config LOG_BUF_SHIFT
+ 		     12 =>  4 KB
+ 
+ config CPUSETS
+ 	bool "Cpuset support"
+ 	depends on SMP
++	#
++	# disabled for now - depends on control groups, which
++	# are hard to backport:
++	#
++	depends on 0
+ 	help
+ 	  This option will let you create and manage CPUSETs which
+ 	  allow dynamically partitioning a system into sets of CPUs and
+ 	  Memory Nodes and assigning tasks to run only within those sets.
+ 	  This is primarily useful on large SMP or NUMA systems.
+ 
+ 	  Say N if unsure.
+ 
++config FAIR_GROUP_SCHED
++	bool "Fair group CPU scheduler"
++	default y
++	depends on EXPERIMENTAL
++	help
++	  This feature lets the CPU scheduler recognize task groups and control CPU
++	  bandwidth allocation to such task groups.
++
++choice
++	depends on FAIR_GROUP_SCHED
++	prompt "Basis for grouping tasks"
++	default FAIR_USER_SCHED
++
++config FAIR_USER_SCHED
++	bool "user id"
++	help
++	  This option will choose userid as the basis for grouping
++	  tasks, thus providing equal CPU bandwidth to each user.
++
++endchoice
++
+ config SYSFS_DEPRECATED
+ 	bool "Create deprecated sysfs files"
+ 	default y
+ 	help
+ 	  This option creates deprecated symlinks such as the
+--- linux-2.6.23.orig/init/main.c
++++ linux-2.6.23/init/main.c
+@@ -750,15 +750,12 @@ static int __init nosoftlockup_setup(cha
+ __setup("nosoftlockup", nosoftlockup_setup);
+ 
+ static void __init do_pre_smp_initcalls(void)
+ {
+ 	extern int spawn_ksoftirqd(void);
+-#ifdef CONFIG_SMP
+-	extern int migration_init(void);
+ 
+ 	migration_init();
+-#endif
+ 	spawn_ksoftirqd();
+ 	if (!nosoftlockup)
+ 		spawn_softlockup_task();
+ }
+ 
+--- linux-2.6.23.orig/kernel/delayacct.c
++++ linux-2.6.23/kernel/delayacct.c
+@@ -113,15 +113,21 @@ int __delayacct_add_tsk(struct taskstats
+ 	tmp = (s64)d->cpu_run_real_total;
+ 	cputime_to_timespec(tsk->utime + tsk->stime, &ts);
+ 	tmp += timespec_to_ns(&ts);
+ 	d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
+ 
++	tmp = (s64)d->cpu_scaled_run_real_total;
++	cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts);
++	tmp += timespec_to_ns(&ts);
++	d->cpu_scaled_run_real_total =
++		(tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
++
+ 	/*
+ 	 * No locking available for sched_info (and too expensive to add one)
+ 	 * Mitigate by taking snapshot of values
+ 	 */
+-	t1 = tsk->sched_info.pcnt;
++	t1 = tsk->sched_info.pcount;
+ 	t2 = tsk->sched_info.run_delay;
+ 	t3 = tsk->sched_info.cpu_time;
+ 
+ 	d->cpu_count += t1;
+ 
+--- linux-2.6.23.orig/kernel/exit.c
++++ linux-2.6.23/kernel/exit.c
+@@ -109,10 +109,11 @@ static void __exit_signal(struct task_st
+ 		 * We won't ever get here for the group leader, since it
+ 		 * will have been the last reference on the signal_struct.
+ 		 */
+ 		sig->utime = cputime_add(sig->utime, tsk->utime);
+ 		sig->stime = cputime_add(sig->stime, tsk->stime);
++		sig->gtime = cputime_add(sig->gtime, tsk->gtime);
+ 		sig->min_flt += tsk->min_flt;
+ 		sig->maj_flt += tsk->maj_flt;
+ 		sig->nvcsw += tsk->nvcsw;
+ 		sig->nivcsw += tsk->nivcsw;
+ 		sig->inblock += task_io_get_inblock(tsk);
+@@ -1240,10 +1241,15 @@ static int wait_task_zombie(struct task_
+ 		psig->cstime =
+ 			cputime_add(psig->cstime,
+ 			cputime_add(p->stime,
+ 			cputime_add(sig->stime,
+ 				    sig->cstime)));
++		psig->cgtime =
++			cputime_add(psig->cgtime,
++			cputime_add(p->gtime,
++			cputime_add(sig->gtime,
++				    sig->cgtime)));
+ 		psig->cmin_flt +=
+ 			p->min_flt + sig->min_flt + sig->cmin_flt;
+ 		psig->cmaj_flt +=
+ 			p->maj_flt + sig->maj_flt + sig->cmaj_flt;
+ 		psig->cnvcsw +=
+--- linux-2.6.23.orig/kernel/fork.c
++++ linux-2.6.23/kernel/fork.c
+@@ -875,10 +875,12 @@ static inline int copy_signal(unsigned l
+ 
+ 	sig->leader = 0;	/* session leadership doesn't inherit */
+ 	sig->tty_old_pgrp = NULL;
+ 
+ 	sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
++	sig->gtime = cputime_zero;
++	sig->cgtime = cputime_zero;
+ 	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
+ 	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
+ 	sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
+ 	sig->sum_sched_runtime = 0;
+ 	INIT_LIST_HEAD(&sig->cpu_timers[0]);
+@@ -1045,10 +1047,13 @@ static struct task_struct *copy_process(
+ 
+ 	p->utime = cputime_zero;
+ 	p->stime = cputime_zero;
+ 	p->prev_utime = cputime_zero;
+ 	p->prev_stime = cputime_zero;
++	p->gtime = cputime_zero;
++	p->utimescaled = cputime_zero;
++	p->stimescaled = cputime_zero;
+ 
+ #ifdef CONFIG_TASK_XACCT
+ 	p->rchar = 0;		/* I/O counter: bytes read */
+ 	p->wchar = 0;		/* I/O counter: bytes written */
+ 	p->syscr = 0;		/* I/O counter: read syscalls */
+--- linux-2.6.23.orig/kernel/ksysfs.c
++++ linux-2.6.23/kernel/ksysfs.c
+@@ -12,10 +12,11 @@
+ #include <linux/string.h>
+ #include <linux/sysfs.h>
+ #include <linux/module.h>
+ #include <linux/init.h>
+ #include <linux/kexec.h>
++#include <linux/sched.h>
+ 
+ #define KERNEL_ATTR_RO(_name) \
+ static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
+ 
+ #define KERNEL_ATTR_RW(_name) \
+@@ -114,9 +115,16 @@ static int __init ksysfs_init(void)
+ 		notes_attr.size = notes_size;
+ 		error = sysfs_create_bin_file(&kernel_subsys.kobj,
+ 					      &notes_attr);
+ 	}
+ 
++	/*
++	 * Create "/sys/kernel/uids" directory and corresponding root user's
++	 * directory under it.
++	 */
++	if (!error)
++		error = uids_kobject_init();
++
+ 	return error;
+ }
+ 
+ core_initcall(ksysfs_init);
+--- linux-2.6.23.orig/kernel/sched.c
++++ linux-2.6.23/kernel/sched.c
+@@ -42,10 +42,11 @@
+ #include <linux/profile.h>
+ #include <linux/freezer.h>
+ #include <linux/vmalloc.h>
+ #include <linux/blkdev.h>
+ #include <linux/delay.h>
++#include <linux/pid_namespace.h>
+ #include <linux/smp.h>
+ #include <linux/threads.h>
+ #include <linux/timer.h>
+ #include <linux/rcupdate.h>
+ #include <linux/cpu.h>
+@@ -59,21 +60,23 @@
+ #include <linux/tsacct_kern.h>
+ #include <linux/kprobes.h>
+ #include <linux/delayacct.h>
+ #include <linux/reciprocal_div.h>
+ #include <linux/unistd.h>
++#include <linux/pagemap.h>
+ 
+ #include <asm/tlb.h>
++#include <asm/irq_regs.h>
+ 
+ /*
+  * Scheduler clock - returns current time in nanosec units.
+  * This is default implementation.
+  * Architectures and sub-architectures can override this.
+  */
+ unsigned long long __attribute__((weak)) sched_clock(void)
+ {
+-	return (unsigned long long)jiffies * (1000000000 / HZ);
++	return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
+ }
+ 
+ /*
+  * Convert user-nice values [ -20 ... 0 ... 19 ]
+  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
+@@ -93,24 +96,22 @@ unsigned long long __attribute__((weak))
+ #define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
+ 
+ /*
+  * Some helpers for converting nanosecond timing to jiffy resolution
+  */
+-#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
+-#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
++#define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
++#define JIFFIES_TO_NS(TIME)	((TIME) * (NSEC_PER_SEC / HZ))
+ 
+ #define NICE_0_LOAD		SCHED_LOAD_SCALE
+ #define NICE_0_SHIFT		SCHED_LOAD_SHIFT
+ 
+ /*
+  * These are the 'tuning knobs' of the scheduler:
+  *
+- * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
+- * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
++ * default timeslice is 100 msecs (used only for SCHED_RR tasks).
+  * Timeslices get refilled after they expire.
+  */
+-#define MIN_TIMESLICE		max(5 * HZ / 1000, 1)
+ #define DEF_TIMESLICE		(100 * HZ / 1000)
+ 
+ #ifdef CONFIG_SMP
+ /*
+  * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
+@@ -130,28 +131,10 @@ static inline void sg_inc_cpu_power(stru
+ 	sg->__cpu_power += val;
+ 	sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
+ }
+ #endif
+ 
+-#define SCALE_PRIO(x, prio) \
+-	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
+-
+-/*
+- * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
+- * to time slice values: [800ms ... 100ms ... 5ms]
+- */
+-static unsigned int static_prio_timeslice(int static_prio)
+-{
+-	if (static_prio == NICE_TO_PRIO(19))
+-		return 1;
+-
+-	if (static_prio < NICE_TO_PRIO(0))
+-		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
+-	else
+-		return SCALE_PRIO(DEF_TIMESLICE, static_prio);
+-}
+-
+ static inline int rt_policy(int policy)
+ {
+ 	if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
+ 		return 1;
+ 	return 0;
+@@ -168,45 +151,115 @@ static inline int task_has_rt_policy(str
+ struct rt_prio_array {
+ 	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
+ 	struct list_head queue[MAX_RT_PRIO];
+ };
+ 
+-struct load_stat {
+-	struct load_weight load;
+-	u64 load_update_start, load_update_last;
+-	unsigned long delta_fair, delta_exec, delta_stat;
++#ifdef CONFIG_FAIR_GROUP_SCHED
++
++#include <linux/cgroup.h>
++
++struct cfs_rq;
++
++/* task group related information */
++struct task_group {
++#ifdef CONFIG_FAIR_CGROUP_SCHED
++	struct cgroup_subsys_state css;
++#endif
++	/* schedulable entities of this group on each cpu */
++	struct sched_entity **se;
++	/* runqueue "owned" by this group on each cpu */
++	struct cfs_rq **cfs_rq;
++	unsigned long shares;
++	/* spinlock to serialize modification to shares */
++	spinlock_t lock;
++	struct rcu_head rcu;
++};
++
++/* Default task group's sched entity on each cpu */
++static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
++/* Default task group's cfs_rq on each cpu */
++static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
++
++static struct sched_entity *init_sched_entity_p[NR_CPUS];
++static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
++
++/* Default task group.
++ *	Every task in the system belongs to this group at bootup.
++ */
++struct task_group init_task_group = {
++	.se     = init_sched_entity_p,
++	.cfs_rq = init_cfs_rq_p,
+ };
+ 
++#ifdef CONFIG_FAIR_USER_SCHED
++# define INIT_TASK_GRP_LOAD	2*NICE_0_LOAD
++#else
++# define INIT_TASK_GRP_LOAD	NICE_0_LOAD
++#endif
++
++static int init_task_group_load = INIT_TASK_GRP_LOAD;
++
++/* return group to which a task belongs */
++static inline struct task_group *task_group(struct task_struct *p)
++{
++	struct task_group *tg;
++
++#ifdef CONFIG_FAIR_USER_SCHED
++	tg = p->user->tg;
++#elif defined(CONFIG_FAIR_CGROUP_SCHED)
++	tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
++				struct task_group, css);
++#else
++	tg = &init_task_group;
++#endif
++	return tg;
++}
++
++/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
++static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu)
++{
++	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
++	p->se.parent = task_group(p)->se[cpu];
++}
++
++#else
++
++static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { }
++
++#endif	/* CONFIG_FAIR_GROUP_SCHED */
++
+ /* CFS-related fields in a runqueue */
+ struct cfs_rq {
+ 	struct load_weight load;
+ 	unsigned long nr_running;
+ 
+-	s64 fair_clock;
+ 	u64 exec_clock;
+-	s64 wait_runtime;
+-	u64 sleeper_bonus;
+-	unsigned long wait_runtime_overruns, wait_runtime_underruns;
++	u64 min_vruntime;
+ 
+ 	struct rb_root tasks_timeline;
+ 	struct rb_node *rb_leftmost;
+ 	struct rb_node *rb_load_balance_curr;
+-#ifdef CONFIG_FAIR_GROUP_SCHED
+ 	/* 'curr' points to currently running entity on this cfs_rq.
+ 	 * It is set to NULL otherwise (i.e when none are currently running).
+ 	 */
+ 	struct sched_entity *curr;
++
++	unsigned long nr_spread_over;
++
++#ifdef CONFIG_FAIR_GROUP_SCHED
+ 	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
+ 
+-	/* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
++	/*
++	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
+ 	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
+ 	 * (like users, containers etc.)
+ 	 *
+ 	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
+ 	 * list is used during load balance.
+ 	 */
+-	struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
++	struct list_head leaf_cfs_rq_list;
++	struct task_group *tg;	/* group that "owns" this runqueue */
+ #endif
+ };
+ 
+ /* Real-Time classes' related field in a runqueue: */
+ struct rt_rq {
+@@ -221,11 +274,12 @@ struct rt_rq {
+  * Locking rule: those places that want to lock multiple runqueues
+  * (such as the load balancing or the thread migration code), lock
+  * acquire operations must be ordered by ascending &runqueue.
+  */
+ struct rq {
+-	spinlock_t lock;	/* runqueue lock */
++	/* runqueue lock: */
++	spinlock_t lock;
+ 
+ 	/*
+ 	 * nr_running and cpu_load should be in the same cacheline because
+ 	 * remote CPUs use both these fields when doing load calculation.
+ 	 */
+@@ -234,19 +288,21 @@ struct rq {
+ 	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+ 	unsigned char idle_at_tick;
+ #ifdef CONFIG_NO_HZ
+ 	unsigned char in_nohz_recently;
+ #endif
+-	struct load_stat ls;	/* capture load from *all* tasks on this cpu */
++	/* capture load from *all* tasks on this cpu: */
++	struct load_weight load;
+ 	unsigned long nr_load_updates;
+ 	u64 nr_switches;
+ 
+ 	struct cfs_rq cfs;
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+-	struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
++	/* list of leaf cfs_rq on this cpu: */
++	struct list_head leaf_cfs_rq_list;
+ #endif
+-	struct rt_rq  rt;
++	struct rt_rq rt;
+ 
+ 	/*
+ 	 * This is part of a global counter where only the total sum
+ 	 * over all CPUs matters. A task can increase this counter on
+ 	 * one CPU and if it got migrated afterwards it may decrease
+@@ -272,34 +328,38 @@ struct rq {
+ 	struct sched_domain *sd;
+ 
+ 	/* For active balancing */
+ 	int active_balance;
+ 	int push_cpu;
+-	int cpu;		/* cpu of this runqueue */
++	/* cpu of this runqueue: */
++	int cpu;
+ 
+ 	struct task_struct *migration_thread;
+ 	struct list_head migration_queue;
+ #endif
+ 
+ #ifdef CONFIG_SCHEDSTATS
+ 	/* latency stats */
+ 	struct sched_info rq_sched_info;
+ 
+ 	/* sys_sched_yield() stats */
+-	unsigned long yld_exp_empty;
+-	unsigned long yld_act_empty;
+-	unsigned long yld_both_empty;
+-	unsigned long yld_cnt;
++	unsigned int yld_exp_empty;
++	unsigned int yld_act_empty;
++	unsigned int yld_both_empty;
++	unsigned int yld_count;
+ 
+ 	/* schedule() stats */
+-	unsigned long sched_switch;
+-	unsigned long sched_cnt;
+-	unsigned long sched_goidle;
++	unsigned int sched_switch;
++	unsigned int sched_count;
++	unsigned int sched_goidle;
+ 
+ 	/* try_to_wake_up() stats */
+-	unsigned long ttwu_cnt;
+-	unsigned long ttwu_local;
++	unsigned int ttwu_count;
++	unsigned int ttwu_local;
++
++	/* BKL stats */
++	unsigned int bkl_count;
+ #endif
+ 	struct lock_class_key rq_lock_key;
+ };
+ 
+ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+@@ -380,10 +440,45 @@ static void update_rq_clock(struct rq *r
+ #define this_rq()		(&__get_cpu_var(runqueues))
+ #define task_rq(p)		cpu_rq(task_cpu(p))
+ #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
+ 
+ /*
++ * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
++ */
++#ifdef CONFIG_SCHED_DEBUG
++# define const_debug __read_mostly
++#else
++# define const_debug static const
++#endif
++
++/*
++ * Debugging: various feature bits
++ */
++enum {
++	SCHED_FEAT_NEW_FAIR_SLEEPERS	= 1,
++	SCHED_FEAT_WAKEUP_PREEMPT	= 2,
++	SCHED_FEAT_START_DEBIT		= 4,
++	SCHED_FEAT_TREE_AVG		= 8,
++	SCHED_FEAT_APPROX_AVG		= 16,
++};
++
++const_debug unsigned int sysctl_sched_features =
++		SCHED_FEAT_NEW_FAIR_SLEEPERS	* 1 |
++		SCHED_FEAT_WAKEUP_PREEMPT	* 1 |
++		SCHED_FEAT_START_DEBIT		* 1 |
++		SCHED_FEAT_TREE_AVG		* 0 |
++		SCHED_FEAT_APPROX_AVG		* 0;
++
++#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
++
++/*
++ * Number of tasks to iterate in a single balance run.
++ * Limited because this is done with IRQs disabled.
++ */
++const_debug unsigned int sysctl_sched_nr_migrate = 32;
++
++/*
+  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
+  * clock constructed from sched_clock():
+  */
+ unsigned long long cpu_clock(int cpu)
+ {
+@@ -391,40 +486,39 @@ unsigned long long cpu_clock(int cpu)
+ 	unsigned long flags;
+ 	struct rq *rq;
+ 
+ 	local_irq_save(flags);
+ 	rq = cpu_rq(cpu);
+-	update_rq_clock(rq);
++	/*
++	 * Only call sched_clock() if the scheduler has already been
++	 * initialized (some code might call cpu_clock() very early):
++	 */
++	if (rq->idle)
++		update_rq_clock(rq);
+ 	now = rq->clock;
+ 	local_irq_restore(flags);
+ 
+ 	return now;
+ }
+-
+-#ifdef CONFIG_FAIR_GROUP_SCHED
+-/* Change a task's ->cfs_rq if it moves across CPUs */
+-static inline void set_task_cfs_rq(struct task_struct *p)
+-{
+-	p->se.cfs_rq = &task_rq(p)->cfs;
+-}
+-#else
+-static inline void set_task_cfs_rq(struct task_struct *p)
+-{
+-}
+-#endif
++EXPORT_SYMBOL_GPL(cpu_clock);
+ 
+ #ifndef prepare_arch_switch
+ # define prepare_arch_switch(next)	do { } while (0)
+ #endif
+ #ifndef finish_arch_switch
+ # define finish_arch_switch(prev)	do { } while (0)
+ #endif
+ 
++static inline int task_current(struct rq *rq, struct task_struct *p)
++{
++	return rq->curr == p;
++}
++
+ #ifndef __ARCH_WANT_UNLOCKED_CTXSW
+ static inline int task_running(struct rq *rq, struct task_struct *p)
+ {
+-	return rq->curr == p;
++	return task_current(rq, p);
+ }
+ 
+ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
+ {
+ }
+@@ -449,11 +543,11 @@ static inline void finish_lock_switch(st
+ static inline int task_running(struct rq *rq, struct task_struct *p)
+ {
+ #ifdef CONFIG_SMP
+ 	return p->oncpu;
+ #else
+-	return rq->curr == p;
++	return task_current(rq, p);
+ #endif
+ }
+ 
+ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
+ {
+@@ -494,44 +588,40 @@ static inline void finish_lock_switch(st
+  * Must be called interrupts disabled.
+  */
+ static inline struct rq *__task_rq_lock(struct task_struct *p)
+ 	__acquires(rq->lock)
+ {
+-	struct rq *rq;
+-
+-repeat_lock_task:
+-	rq = task_rq(p);
+-	spin_lock(&rq->lock);
+-	if (unlikely(rq != task_rq(p))) {
++	for (;;) {
++		struct rq *rq = task_rq(p);
++		spin_lock(&rq->lock);
++		if (likely(rq == task_rq(p)))
++			return rq;
+ 		spin_unlock(&rq->lock);
+-		goto repeat_lock_task;
+ 	}
+-	return rq;
+ }
+ 
+ /*
+  * task_rq_lock - lock the runqueue a given task resides on and disable
+- * interrupts.  Note the ordering: we can safely lookup the task_rq without
++ * interrupts. Note the ordering: we can safely lookup the task_rq without
+  * explicitly disabling preemption.
+  */
+ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+ 	__acquires(rq->lock)
+ {
+ 	struct rq *rq;
+ 
+-repeat_lock_task:
+-	local_irq_save(*flags);
+-	rq = task_rq(p);
+-	spin_lock(&rq->lock);
+-	if (unlikely(rq != task_rq(p))) {
++	for (;;) {
++		local_irq_save(*flags);
++		rq = task_rq(p);
++		spin_lock(&rq->lock);
++		if (likely(rq == task_rq(p)))
++			return rq;
+ 		spin_unlock_irqrestore(&rq->lock, *flags);
+-		goto repeat_lock_task;
+ 	}
+-	return rq;
+ }
+ 
+-static inline void __task_rq_unlock(struct rq *rq)
++static void __task_rq_unlock(struct rq *rq)
+ 	__releases(rq->lock)
+ {
+ 	spin_unlock(&rq->lock);
+ }
+ 
+@@ -542,11 +632,11 @@ static inline void task_rq_unlock(struct
+ }
+ 
+ /*
+  * this_rq_lock - lock this runqueue and disable interrupts.
+  */
+-static inline struct rq *this_rq_lock(void)
++static struct rq *this_rq_lock(void)
+ 	__acquires(rq->lock)
+ {
+ 	struct rq *rq;
+ 
+ 	local_irq_disable();
+@@ -576,10 +666,11 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep
+ void sched_clock_idle_wakeup_event(u64 delta_ns)
+ {
+ 	struct rq *rq = cpu_rq(smp_processor_id());
+ 	u64 now = sched_clock();
+ 
++	touch_softlockup_watchdog();
+ 	rq->idle_clock += delta_ns;
+ 	/*
+ 	 * Override the previous timestamp and ignore all
+ 	 * sched_clock() deltas that occured while we idled,
+ 	 * and use the PM-provided delta_ns to advance the
+@@ -642,23 +733,10 @@ static inline void resched_task(struct t
+ 	assert_spin_locked(&task_rq(p)->lock);
+ 	set_tsk_need_resched(p);
+ }
+ #endif
+ 
+-static u64 div64_likely32(u64 divident, unsigned long divisor)
+-{
+-#if BITS_PER_LONG == 32
+-	if (likely(divident <= 0xffffffffULL))
+-		return (u32)divident / divisor;
+-	do_div(divident, divisor);
+-
+-	return divident;
+-#else
+-	return divident / divisor;
+-#endif
+-}
+-
+ #if BITS_PER_LONG == 32
+ # define WMULT_CONST	(~0UL)
+ #else
+ # define WMULT_CONST	(1UL << 32)
+ #endif
+@@ -696,27 +774,25 @@ static inline unsigned long
+ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
+ {
+ 	return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
+ }
+ 
+-static void update_load_add(struct load_weight *lw, unsigned long inc)
++static inline void update_load_add(struct load_weight *lw, unsigned long inc)
+ {
+ 	lw->weight += inc;
+-	lw->inv_weight = 0;
+ }
+ 
+-static void update_load_sub(struct load_weight *lw, unsigned long dec)
++static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
+ {
+ 	lw->weight -= dec;
+-	lw->inv_weight = 0;
+ }
+ 
+ /*
+  * To aid in avoiding the subversion of "niceness" due to uneven distribution
+  * of tasks with abnormal "nice" values across CPUs the contribution that
+  * each task makes to its run queue's load is weighted according to its
+- * scheduling class and "nice" value.  For SCHED_NORMAL tasks this is just a
++ * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
+  * scaled version of the new time slice allocation that they receive on time
+  * slice expiry etc.
+  */
+ 
+ #define WEIGHT_IDLEPRIO		2
+@@ -774,76 +850,62 @@ struct rq_iterator {
+ 	void *arg;
+ 	struct task_struct *(*start)(void *);
+ 	struct task_struct *(*next)(void *);
+ };
+ 
+-static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+-		      unsigned long max_nr_move, unsigned long max_load_move,
+-		      struct sched_domain *sd, enum cpu_idle_type idle,
+-		      int *all_pinned, unsigned long *load_moved,
+-		      int *this_best_prio, struct rq_iterator *iterator);
++#ifdef CONFIG_SMP
++static unsigned long
++balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
++	      unsigned long max_load_move, struct sched_domain *sd,
++	      enum cpu_idle_type idle, int *all_pinned,
++	      int *this_best_prio, struct rq_iterator *iterator);
++
++static int
++iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
++		   struct sched_domain *sd, enum cpu_idle_type idle,
++		   struct rq_iterator *iterator);
++#endif
++
++#ifdef CONFIG_CGROUP_CPUACCT
++static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
++#else
++static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
++#endif
+ 
+ #include "sched_stats.h"
+-#include "sched_rt.c"
+-#include "sched_fair.c"
+ #include "sched_idletask.c"
++#include "sched_fair.c"
++#include "sched_rt.c"
+ #ifdef CONFIG_SCHED_DEBUG
+ # include "sched_debug.c"
+ #endif
+ 
+ #define sched_class_highest (&rt_sched_class)
+ 
+-static void __update_curr_load(struct rq *rq, struct load_stat *ls)
+-{
+-	if (rq->curr != rq->idle && ls->load.weight) {
+-		ls->delta_exec += ls->delta_stat;
+-		ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
+-		ls->delta_stat = 0;
+-	}
+-}
+-
+ /*
+  * Update delta_exec, delta_fair fields for rq.
+  *
+  * delta_fair clock advances at a rate inversely proportional to
+- * total load (rq->ls.load.weight) on the runqueue, while
++ * total load (rq->load.weight) on the runqueue, while
+  * delta_exec advances at the same rate as wall-clock (provided
+  * cpu is not idle).
+  *
+  * delta_exec / delta_fair is a measure of the (smoothened) load on this
+  * runqueue over any given interval. This (smoothened) load is used
+  * during load balance.
+  *
+- * This function is called /before/ updating rq->ls.load
++ * This function is called /before/ updating rq->load
+  * and when switching tasks.
+  */
+-static void update_curr_load(struct rq *rq)
+-{
+-	struct load_stat *ls = &rq->ls;
+-	u64 start;
+-
+-	start = ls->load_update_start;
+-	ls->load_update_start = rq->clock;
+-	ls->delta_stat += rq->clock - start;
+-	/*
+-	 * Stagger updates to ls->delta_fair. Very frequent updates
+-	 * can be expensive.
+-	 */
+-	if (ls->delta_stat >= sysctl_sched_stat_granularity)
+-		__update_curr_load(rq, ls);
+-}
+-
+ static inline void inc_load(struct rq *rq, const struct task_struct *p)
+ {
+-	update_curr_load(rq);
+-	update_load_add(&rq->ls.load, p->se.load.weight);
++	update_load_add(&rq->load, p->se.load.weight);
+ }
+ 
+ static inline void dec_load(struct rq *rq, const struct task_struct *p)
+ {
+-	update_curr_load(rq);
+-	update_load_sub(&rq->ls.load, p->se.load.weight);
++	update_load_sub(&rq->load, p->se.load.weight);
+ }
+ 
+ static void inc_nr_running(struct task_struct *p, struct rq *rq)
+ {
+ 	rq->nr_running++;
+@@ -856,12 +918,10 @@ static void dec_nr_running(struct task_s
+ 	dec_load(rq, p);
+ }
+ 
+ static void set_load_weight(struct task_struct *p)
+ {
+-	p->se.wait_runtime = 0;
+-
+ 	if (task_has_rt_policy(p)) {
+ 		p->se.load.weight = prio_to_weight[0] * 2;
+ 		p->se.load.inv_weight = prio_to_wmult[0] >> 1;
+ 		return;
+ 	}
+@@ -949,24 +1009,10 @@ static void activate_task(struct rq *rq,
+ 	enqueue_task(rq, p, wakeup);
+ 	inc_nr_running(p, rq);
+ }
+ 
+ /*
+- * activate_idle_task - move idle task to the _front_ of runqueue.
+- */
+-static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
+-{
+-	update_rq_clock(rq);
+-
+-	if (p->state == TASK_UNINTERRUPTIBLE)
+-		rq->nr_uninterruptible--;
+-
+-	enqueue_task(rq, p, 0);
+-	inc_nr_running(p, rq);
+-}
+-
+-/*
+  * deactivate_task - remove a task from the runqueue.
+  */
+ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
+ {
+ 	if (p->state == TASK_UNINTERRUPTIBLE)
+@@ -986,45 +1032,76 @@ inline int task_curr(const struct task_s
+ }
+ 
+ /* Used instead of source_load when we know the type == 0 */
+ unsigned long weighted_cpuload(const int cpu)
+ {
+-	return cpu_rq(cpu)->ls.load.weight;
++	return cpu_rq(cpu)->load.weight;
+ }
+ 
+ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
+ {
++	set_task_cfs_rq(p, cpu);
+ #ifdef CONFIG_SMP
++	/*
++	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
++	 * successfully executed on another CPU. We must ensure that updates of
++	 * per-task data have been completed by this moment.
++	 */
++	smp_wmb();
+ 	task_thread_info(p)->cpu = cpu;
+-	set_task_cfs_rq(p);
+ #endif
+ }
+ 
+ #ifdef CONFIG_SMP
+ 
++/*
++ * Is this task likely cache-hot:
++ */
++static inline int
++task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
++{
++	s64 delta;
++
++	if (p->sched_class