From 4e64c8dfdb92fa358349c24969c617039ab88cd3 Mon Sep 17 00:00:00 2001
From: Leon Woestenberg <leon.woestenberg@gmail.com>
Date: Sun, 6 May 2007 15:32:33 +0000
Subject: linux-efika: Add kernel 2.6.20.11 with CFS scheduler.

---
 .../linux/linux-efika-2.6.20.11/.mtn2git_empty     |    0
 .../sched-cfs-v9-v2.6.20.11.patch                  | 5590 ++++++++++++++++++++
 packages/linux/linux-efika_2.6.20.11.bb            |   86 +
 3 files changed, 5676 insertions(+)
 create mode 100644 packages/linux/linux-efika-2.6.20.11/.mtn2git_empty
 create mode 100644 packages/linux/linux-efika-2.6.20.11/sched-cfs-v9-v2.6.20.11.patch
 create mode 100644 packages/linux/linux-efika_2.6.20.11.bb
diff --git a/packages/linux/linux-efika-2.6.20.11/.mtn2git_empty b/packages/linux/linux-efika-2.6.20.11/.mtn2git_empty
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/packages/linux/linux-efika-2.6.20.11/sched-cfs-v9-v2.6.20.11.patch b/packages/linux/linux-efika-2.6.20.11/sched-cfs-v9-v2.6.20.11.patch
new file mode 100644
index 0000000000..29071a99ac
--- /dev/null
+++ b/packages/linux/linux-efika-2.6.20.11/sched-cfs-v9-v2.6.20.11.patch
@@ -0,0 +1,5590 @@
+This is the Completely Fair Scheduler (CFS) v9 patch for
+linux 2.6.20.10 (rediffed cleanly against .11).
+
+http://people.redhat.com/mingo/cfs-scheduler/
+
+Index: linux-cfs-2.6.20.8.q/Documentation/kernel-parameters.txt
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/Documentation/kernel-parameters.txt
++++ linux-cfs-2.6.20.8.q/Documentation/kernel-parameters.txt
+@@ -914,49 +914,6 @@ and is between 256 and 4096 characters. 
+ 
+ 	mga=		[HW,DRM]
+ 
+-	migration_cost=
+-			[KNL,SMP] debug: override scheduler migration costs
+-			Format: <level-1-usecs>,<level-2-usecs>,...
+-			This debugging option can be used to override the
+-			default scheduler migration cost matrix. The numbers
+-			are indexed by 'CPU domain distance'.
+-			E.g. migration_cost=1000,2000,3000 on an SMT NUMA
+-			box will set up an intra-core migration cost of
+-			1 msec, an inter-core migration cost of 2 msecs,
+-			and an inter-node migration cost of 3 msecs.
+-
+-			WARNING: using the wrong values here can break
+-			scheduler performance, so it's only for scheduler
+-			development purposes, not production environments.
+-
+-	migration_debug=
+-			[KNL,SMP] migration cost auto-detect verbosity
+-			Format=<0|1|2>
+-			If a system's migration matrix reported at bootup
+-			seems erroneous then this option can be used to
+-			increase verbosity of the detection process.
+-			We default to 0 (no extra messages), 1 will print
+-			some more information, and 2 will be really
+-			verbose (probably only useful if you also have a
+-			serial console attached to the system).
+-
+-	migration_factor=
+-			[KNL,SMP] multiply/divide migration costs by a factor
+-			Format=<percent>
+-			This debug option can be used to proportionally
+-			increase or decrease the auto-detected migration
+-			costs for all entries of the migration matrix.
+-			E.g. migration_factor=150 will increase migration
+-			costs by 50%. (and thus the scheduler will be less
+-			eager migrating cache-hot tasks)
+-			migration_factor=80 will decrease migration costs
+-			by 20%. (thus the scheduler will be more eager to
+-			migrate tasks)
+-
+-			WARNING: using the wrong values here can break
+-			scheduler performance, so it's only for scheduler
+-			development purposes, not production environments.
+-
+ 	mousedev.tap_time=
+ 			[MOUSE] Maximum time between finger touching and
+ 			leaving touchpad surface for touch to be considered
+Index: linux-cfs-2.6.20.8.q/Documentation/sched-design-CFS.txt
+===================================================================
+--- /dev/null
++++ linux-cfs-2.6.20.8.q/Documentation/sched-design-CFS.txt
+@@ -0,0 +1,107 @@
++[announce] [patch] Modular Scheduler Core and Completely Fair Scheduler [CFS]
++
++i'm pleased to announce the first release of the "Modular Scheduler Core
++and Completely Fair Scheduler [CFS]" patchset:
++
++   http://redhat.com/~mingo/cfs-scheduler/
++
++This project is a complete rewrite of the Linux task scheduler. My goal
++is to address various feature requests and to fix deficiencies in the
++vanilla scheduler that were suggested/found in the past few years, both
++for desktop scheduling and for server scheduling workloads.
++
++[ QuickStart: apply the patch, recompile, reboot. The new scheduler
++  will be active by default and all tasks will default to the
++  SCHED_NORMAL interactive scheduling class. ]
++
++Highlights are:
++
++ - the introduction of Scheduling Classes: an extensible hierarchy of
++   scheduler modules. These modules encapsulate scheduling policy
++   details and are handled by the scheduler core without the core
++   code assuming too much about them.
++
++ - sched_fair.c implements the 'CFS desktop scheduler': it is a
++   replacement for the vanilla scheduler's SCHED_OTHER interactivity
++   code.
++
++   i'd like to give credit to Con Kolivas for the general approach here:
++   he has proven via RSDL/SD that 'fair scheduling' is possible and that
++   it results in better desktop scheduling. Kudos Con!
++
++   The CFS patch uses a completely different approach and implementation
++   from RSDL/SD. My goal was to make CFS's interactivity quality exceed
++   that of RSDL/SD, which is a high standard to meet :-) Testing
++   feedback is welcome to decide this one way or another. [ and, in any
++   case, all of SD's logic could be added via a kernel/sched_sd.c module
++   as well, if Con is interested in such an approach. ]
++
++   CFS's design is quite radical: it does not use runqueues, it uses a
++   time-ordered rbtree to build a 'timeline' of future task execution,
++   and thus has no 'array switch' artifacts (by which both the vanilla
++   scheduler and RSDL/SD are affected).
++
++   CFS uses nanosecond granularity accounting and does not rely on any
++   jiffies or other HZ detail. Thus the CFS scheduler has no notion of
++   'timeslices' and has no heuristics whatsoever. There is only one
++   central tunable:
++
++         /proc/sys/kernel/sched_granularity_ns
++
++   which can be used to tune the scheduler from 'desktop' (low
++   latencies) to 'server' (good batching) workloads. It defaults to a
++   setting suitable for desktop workloads. SCHED_BATCH is handled by the
++   CFS scheduler module too.
++
++   due to its design, the CFS scheduler is not prone to any of the
++   'attacks' that exist today against the heuristics of the stock
++   scheduler: fiftyp.c, thud.c, chew.c, ring-test.c, massive_intr.c all
++   work fine and do not impact interactivity and produce the expected
++   behavior.
++
++   the CFS scheduler has a much stronger handling of nice levels and
++   SCHED_BATCH: both types of workloads should be isolated much more
++   aggressively than under the vanilla scheduler.
++
++   ( another detail: due to nanosec accounting and timeline sorting,
++     sched_yield() support is very simple under CFS, and in fact under
++     CFS sched_yield() behaves much better than under any other
++     scheduler i have tested so far. )
++
++ - sched_rt.c implements SCHED_FIFO and SCHED_RR semantics, in a simpler
++   way than the vanilla scheduler does. It uses 100 runqueues (for all
++   100 RT priority levels, instead of 140 in the vanilla scheduler)
++   and it needs no expired array.
++
++ - reworked/sanitized SMP load-balancing: the runqueue-walking
++   assumptions are gone from the load-balancing code now, and
++   iterators of the scheduling modules are used. The balancing code got
++   quite a bit simpler as a result.
++
++the core scheduler got smaller by more than 700 lines:
++
++ kernel/sched.c | 1454 ++++++++++++++++------------------------------------------------
++ 1 file changed, 372 insertions(+), 1082 deletions(-)
++
++and even adding all the scheduling modules, the total size impact is
++relatively small:
++
++ 18 files changed, 1454 insertions(+), 1133 deletions(-)
++
++most of the increase is due to extensive comments. The kernel size
++impact is in fact a small negative:
++
++   text    data     bss     dec     hex filename
++  23366    4001      24   27391    6aff kernel/sched.o.vanilla
++  24159    2705      56   26920    6928 kernel/sched.o.CFS
++
++(this is mainly due to the benefit of getting rid of the expired array
++and its data structure overhead.)
++
++thanks go to Thomas Gleixner and Arjan van de Ven for review of this
++patchset.
++
++as usual, any sort of feedback, bugreports, fixes and suggestions are
++more than welcome,
++
++	Ingo
+Index: linux-cfs-2.6.20.8.q/Makefile
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/Makefile
++++ linux-cfs-2.6.20.8.q/Makefile
+@@ -1,7 +1,7 @@
+ VERSION = 2
+ PATCHLEVEL = 6
+ SUBLEVEL = 20
+-EXTRAVERSION = .11
++EXTRAVERSION = .11-cfs-v9
+ NAME = Homicidal Dwarf Hamster
+ 
+ # *DOCUMENTATION*
+Index: linux-cfs-2.6.20.8.q/arch/i386/kernel/smpboot.c
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/arch/i386/kernel/smpboot.c
++++ linux-cfs-2.6.20.8.q/arch/i386/kernel/smpboot.c
+@@ -1132,18 +1132,6 @@ exit:
+ }
+ #endif
+ 
+-static void smp_tune_scheduling(void)
+-{
+-	unsigned long cachesize;       /* kB   */
+-
+-	if (cpu_khz) {
+-		cachesize = boot_cpu_data.x86_cache_size;
+-
+-		if (cachesize > 0)
+-			max_cache_size = cachesize * 1024;
+-	}
+-}
+-
+ /*
+  * Cycle through the processors sending APIC IPIs to boot each.
+  */
+@@ -1172,7 +1160,6 @@ static void __init smp_boot_cpus(unsigne
+ 	x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
+ 
+ 	current_thread_info()->cpu = 0;
+-	smp_tune_scheduling();
+ 
+ 	set_cpu_sibling_map(0);
+ 
+Index: linux-cfs-2.6.20.8.q/arch/i386/kernel/syscall_table.S
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/arch/i386/kernel/syscall_table.S
++++ linux-cfs-2.6.20.8.q/arch/i386/kernel/syscall_table.S
+@@ -319,3 +319,4 @@ ENTRY(sys_call_table)
+ 	.long sys_move_pages
+ 	.long sys_getcpu
+ 	.long sys_epoll_pwait
++	.long sys_sched_yield_to	/* 320 */
+Index: linux-cfs-2.6.20.8.q/arch/i386/kernel/tsc.c
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/arch/i386/kernel/tsc.c
++++ linux-cfs-2.6.20.8.q/arch/i386/kernel/tsc.c
+@@ -61,6 +61,8 @@ static inline int check_tsc_unstable(voi
+ 
+ void mark_tsc_unstable(void)
+ {
++	sched_clock_unstable_event();
++
+ 	tsc_unstable = 1;
+ }
+ EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+@@ -107,13 +109,7 @@ unsigned long long sched_clock(void)
+ {
+ 	unsigned long long this_offset;
+ 
+-	/*
+-	 * in the NUMA case we dont use the TSC as they are not
+-	 * synchronized across all CPUs.
+-	 */
+-#ifndef CONFIG_NUMA
+-	if (!cpu_khz || check_tsc_unstable())
+-#endif
++	if (!cpu_khz || !cpu_has_tsc)
+ 		/* no locking but a rare wrong value is not a big deal */
+ 		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
+ 
+Index: linux-cfs-2.6.20.8.q/arch/ia64/kernel/setup.c
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/arch/ia64/kernel/setup.c
++++ linux-cfs-2.6.20.8.q/arch/ia64/kernel/setup.c
+@@ -773,7 +773,6 @@ static void __cpuinit
+ get_max_cacheline_size (void)
+ {
+ 	unsigned long line_size, max = 1;
+-	unsigned int cache_size = 0;
+ 	u64 l, levels, unique_caches;
+         pal_cache_config_info_t cci;
+         s64 status;
+@@ -803,8 +802,6 @@ get_max_cacheline_size (void)
+ 		line_size = 1 << cci.pcci_line_size;
+ 		if (line_size > max)
+ 			max = line_size;
+-		if (cache_size < cci.pcci_cache_size)
+-			cache_size = cci.pcci_cache_size;
+ 		if (!cci.pcci_unified) {
+ 			status = ia64_pal_cache_config_info(l,
+ 						    /* cache_type (instruction)= */ 1,
+@@ -821,9 +818,6 @@ get_max_cacheline_size (void)
+ 			ia64_i_cache_stride_shift = cci.pcci_stride;
+ 	}
+   out:
+-#ifdef CONFIG_SMP
+-	max_cache_size = max(max_cache_size, cache_size);
+-#endif
+ 	if (max > ia64_max_cacheline_size)
+ 		ia64_max_cacheline_size = max;
+ }
+Index: linux-cfs-2.6.20.8.q/arch/mips/kernel/smp.c
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/arch/mips/kernel/smp.c
++++ linux-cfs-2.6.20.8.q/arch/mips/kernel/smp.c
+@@ -245,7 +245,6 @@ void __init smp_prepare_cpus(unsigned in
+ {
+ 	init_new_context(current, &init_mm);
+ 	current_thread_info()->cpu = 0;
+-	smp_tune_scheduling();
+ 	plat_prepare_cpus(max_cpus);
+ #ifndef CONFIG_HOTPLUG_CPU
+ 	cpu_present_map = cpu_possible_map;
+Index: linux-cfs-2.6.20.8.q/arch/sparc/kernel/smp.c
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/arch/sparc/kernel/smp.c
++++ linux-cfs-2.6.20.8.q/arch/sparc/kernel/smp.c
+@@ -69,16 +69,6 @@ void __cpuinit smp_store_cpu_info(int id
+ 	cpu_data(id).prom_node = cpu_node;
+ 	cpu_data(id).mid = cpu_get_hwmid(cpu_node);
+ 
+-	/* this is required to tune the scheduler correctly */
+-	/* is it possible to have CPUs with different cache sizes? */
+-	if (id == boot_cpu_id) {
+-		int cache_line,cache_nlines;
+-		cache_line = 0x20;
+-		cache_line = prom_getintdefault(cpu_node, "ecache-line-size", cache_line);
+-		cache_nlines = 0x8000;
+-		cache_nlines = prom_getintdefault(cpu_node, "ecache-nlines", cache_nlines);
+-		max_cache_size = cache_line * cache_nlines;
+-	}
+ 	if (cpu_data(id).mid < 0)
+ 		panic("No MID found for CPU%d at node 0x%08d", id, cpu_node);
+ }
+Index: linux-cfs-2.6.20.8.q/arch/sparc64/kernel/smp.c
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/arch/sparc64/kernel/smp.c
++++ linux-cfs-2.6.20.8.q/arch/sparc64/kernel/smp.c
+@@ -1293,41 +1293,6 @@ int setup_profiling_timer(unsigned int m
+ 	return 0;
+ }
+ 
+-static void __init smp_tune_scheduling(void)
+-{
+-	struct device_node *dp;
+-	int instance;
+-	unsigned int def, smallest = ~0U;
+-
+-	def = ((tlb_type == hypervisor) ?
+-	       (3 * 1024 * 1024) :
+-	       (4 * 1024 * 1024));
+-
+-	instance = 0;
+-	while (!cpu_find_by_instance(instance, &dp, NULL)) {
+-		unsigned int val;
+-
+-		val = of_getintprop_default(dp, "ecache-size", def);
+-		if (val < smallest)
+-			smallest = val;
+-
+-		instance++;
+-	}
+-
+-	/* Any value less than 256K is nonsense.  */
+-	if (smallest < (256U * 1024U))
+-		smallest = 256 * 1024;
+-
+-	max_cache_size = smallest;
+-
+-	if (smallest < 1U * 1024U * 1024U)
+-		printk(KERN_INFO "Using max_cache_size of %uKB\n",
+-		       smallest / 1024U);
+-	else
+-		printk(KERN_INFO "Using max_cache_size of %uMB\n",
+-		       smallest / 1024U / 1024U);
+-}
+-
+ /* Constrain the number of cpus to max_cpus.  */
+ void __init smp_prepare_cpus(unsigned int max_cpus)
+ {
+@@ -1363,7 +1328,6 @@ void __init smp_prepare_cpus(unsigned in
+ 	}
+ 
+ 	smp_store_cpu_info(boot_cpu_id);
+-	smp_tune_scheduling();
+ }
+ 
+ /* Set this up early so that things like the scheduler can init
+Index: linux-cfs-2.6.20.8.q/fs/proc/array.c
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/fs/proc/array.c
++++ linux-cfs-2.6.20.8.q/fs/proc/array.c
+@@ -165,7 +165,6 @@ static inline char * task_state(struct t
+ 	rcu_read_lock();
+ 	buffer += sprintf(buffer,
+ 		"State:\t%s\n"
+-		"SleepAVG:\t%lu%%\n"
+ 		"Tgid:\t%d\n"
+ 		"Pid:\t%d\n"
+ 		"PPid:\t%d\n"
+@@ -173,9 +172,8 @@ static inline char * task_state(struct t
+ 		"Uid:\t%d\t%d\t%d\t%d\n"
+ 		"Gid:\t%d\t%d\t%d\t%d\n",
+ 		get_task_state(p),
+-		(p->sleep_avg/1024)*100/(1020000000/1024),
+-	       	p->tgid, p->pid,
+-	       	pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
++		p->tgid, p->pid,
++		pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
+ 		pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0,
+ 		p->uid, p->euid, p->suid, p->fsuid,
+ 		p->gid, p->egid, p->sgid, p->fsgid);
+@@ -312,6 +310,11 @@ int proc_pid_status(struct task_struct *
+ 	return buffer - orig;
+ }
+ 
++int proc_pid_sched(struct task_struct *task, char *buffer)
++{
++	return sched_print_task_state(task, buffer) - buffer;
++}
++
+ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
+ {
+ 	unsigned long vsize, eip, esp, wchan = ~0UL;
+Index: linux-cfs-2.6.20.8.q/fs/proc/base.c
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/fs/proc/base.c
++++ linux-cfs-2.6.20.8.q/fs/proc/base.c
+@@ -1839,6 +1839,7 @@ static struct pid_entry tgid_base_stuff[
+ 	INF("environ",    S_IRUSR, pid_environ),
+ 	INF("auxv",       S_IRUSR, pid_auxv),
+ 	INF("status",     S_IRUGO, pid_status),
++	INF("sched",      S_IRUGO, pid_sched),
+ 	INF("cmdline",    S_IRUGO, pid_cmdline),
+ 	INF("stat",       S_IRUGO, tgid_stat),
+ 	INF("statm",      S_IRUGO, pid_statm),
+@@ -2121,6 +2122,7 @@ static struct pid_entry tid_base_stuff[]
+ 	INF("environ",   S_IRUSR, pid_environ),
+ 	INF("auxv",      S_IRUSR, pid_auxv),
+ 	INF("status",    S_IRUGO, pid_status),
++	INF("sched",     S_IRUGO, pid_sched),
+ 	INF("cmdline",   S_IRUGO, pid_cmdline),
+ 	INF("stat",      S_IRUGO, tid_stat),
+ 	INF("statm",     S_IRUGO, pid_statm),
+Index: linux-cfs-2.6.20.8.q/fs/proc/internal.h
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/fs/proc/internal.h
++++ linux-cfs-2.6.20.8.q/fs/proc/internal.h
+@@ -36,6 +36,7 @@ extern int proc_exe_link(struct inode *,
+ extern int proc_tid_stat(struct task_struct *,  char *);
+ extern int proc_tgid_stat(struct task_struct *, char *);
+ extern int proc_pid_status(struct task_struct *, char *);
++extern int proc_pid_sched(struct task_struct *, char *);
+ extern int proc_pid_statm(struct task_struct *, char *);
+ 
+ extern struct file_operations proc_maps_operations;
+Index: linux-cfs-2.6.20.8.q/include/asm-generic/bitops/sched.h
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/include/asm-generic/bitops/sched.h
++++ linux-cfs-2.6.20.8.q/include/asm-generic/bitops/sched.h
+@@ -6,28 +6,23 @@
+ 
+ /*
+  * Every architecture must define this function. It's the fastest
+- * way of searching a 140-bit bitmap where the first 100 bits are
+- * unlikely to be set. It's guaranteed that at least one of the 140
+- * bits is cleared.
++ * way of searching a 100-bit bitmap.  It's guaranteed that at least
++ * one of the 100 bits is set.
+  */
+ static inline int sched_find_first_bit(const unsigned long *b)
+ {
+ #if BITS_PER_LONG == 64
+-	if (unlikely(b[0]))
++	if (b[0])
+ 		return __ffs(b[0]);
+-	if (likely(b[1]))
+-		return __ffs(b[1]) + 64;
+-	return __ffs(b[2]) + 128;
++	return __ffs(b[1]) + 64;
+ #elif BITS_PER_LONG == 32
+-	if (unlikely(b[0]))
++	if (b[0])
+ 		return __ffs(b[0]);
+-	if (unlikely(b[1]))
++	if (b[1])
+ 		return __ffs(b[1]) + 32;
+-	if (unlikely(b[2]))
++	if (b[2])
+ 		return __ffs(b[2]) + 64;
+-	if (b[3])
+-		return __ffs(b[3]) + 96;
+-	return __ffs(b[4]) + 128;
++	return __ffs(b[3]) + 96;
+ #else
+ #error BITS_PER_LONG not defined
+ #endif
+Index: linux-cfs-2.6.20.8.q/include/asm-i386/topology.h
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/include/asm-i386/topology.h
++++ linux-cfs-2.6.20.8.q/include/asm-i386/topology.h
+@@ -85,7 +85,6 @@ static inline int node_to_first_cpu(int 
+ 	.idle_idx		= 1,			\
+ 	.newidle_idx		= 2,			\
+ 	.wake_idx		= 1,			\
+-	.per_cpu_gain		= 100,			\
+ 	.flags			= SD_LOAD_BALANCE	\
+ 				| SD_BALANCE_EXEC	\
+ 				| SD_BALANCE_FORK	\
+Index: linux-cfs-2.6.20.8.q/include/asm-i386/unistd.h
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/include/asm-i386/unistd.h
++++ linux-cfs-2.6.20.8.q/include/asm-i386/unistd.h
+@@ -325,10 +325,11 @@
+ #define __NR_move_pages		317
+ #define __NR_getcpu		318
+ #define __NR_epoll_pwait	319
++#define __NR_sched_yield_to	320
+ 
+ #ifdef __KERNEL__
+ 
+-#define NR_syscalls 320
++#define NR_syscalls 321
+ 
+ #define __ARCH_WANT_IPC_PARSE_VERSION
+ #define __ARCH_WANT_OLD_READDIR
+Index: linux-cfs-2.6.20.8.q/include/asm-ia64/topology.h
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/include/asm-ia64/topology.h
++++ linux-cfs-2.6.20.8.q/include/asm-ia64/topology.h
+@@ -65,7 +65,6 @@ void build_cpu_to_node_map(void);
+ 	.max_interval		= 4,			\
+ 	.busy_factor		= 64,			\
+ 	.imbalance_pct		= 125,			\
+-	.per_cpu_gain		= 100,			\
+ 	.cache_nice_tries	= 2,			\
+ 	.busy_idx		= 2,			\
+ 	.idle_idx		= 1,			\
+@@ -97,7 +96,6 @@ void build_cpu_to_node_map(void);
+ 	.newidle_idx		= 0, /* unused */	\
+ 	.wake_idx		= 1,			\
+ 	.forkexec_idx		= 1,			\
+-	.per_cpu_gain		= 100,			\
+ 	.flags			= SD_LOAD_BALANCE	\
+ 				| SD_BALANCE_EXEC	\
+ 				| SD_BALANCE_FORK	\
+Index: linux-cfs-2.6.20.8.q/include/asm-mips/mach-ip27/topology.h
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/include/asm-mips/mach-ip27/topology.h
++++ linux-cfs-2.6.20.8.q/include/asm-mips/mach-ip27/topology.h
+@@ -28,7 +28,6 @@ extern unsigned char __node_distances[MA
+ 	.busy_factor		= 32,			\
+ 	.imbalance_pct		= 125,			\
+ 	.cache_nice_tries	= 1,			\
+-	.per_cpu_gain		= 100,			\
+ 	.flags			= SD_LOAD_BALANCE	\
+ 				| SD_BALANCE_EXEC	\
+ 				| SD_WAKE_BALANCE,	\
+Index: linux-cfs-2.6.20.8.q/include/asm-powerpc/topology.h
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/include/asm-powerpc/topology.h
++++ linux-cfs-2.6.20.8.q/include/asm-powerpc/topology.h
+@@ -57,7 +57,6 @@ static inline int pcibus_to_node(struct 
+ 	.busy_factor		= 32,			\
+ 	.imbalance_pct		= 125,			\
+ 	.cache_nice_tries	= 1,			\
+-	.per_cpu_gain		= 100,			\
+ 	.busy_idx		= 3,			\
+ 	.idle_idx		= 1,			\
+ 	.newidle_idx		= 2,			\
+Index: linux-cfs-2.6.20.8.q/include/asm-x86_64/topology.h
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/include/asm-x86_64/topology.h
++++ linux-cfs-2.6.20.8.q/include/asm-x86_64/topology.h
+@@ -43,7 +43,6 @@ extern int __node_distance(int, int);
+ 	.newidle_idx		= 0, 			\
+ 	.wake_idx		= 1,			\
+ 	.forkexec_idx		= 1,			\
+-	.per_cpu_gain		= 100,			\
+ 	.flags			= SD_LOAD_BALANCE	\
+ 				| SD_BALANCE_FORK	\
+ 				| SD_BALANCE_EXEC	\
+Index: linux-cfs-2.6.20.8.q/include/asm-x86_64/unistd.h
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/include/asm-x86_64/unistd.h
++++ linux-cfs-2.6.20.8.q/include/asm-x86_64/unistd.h
+@@ -619,8 +619,10 @@ __SYSCALL(__NR_sync_file_range, sys_sync
+ __SYSCALL(__NR_vmsplice, sys_vmsplice)
+ #define __NR_move_pages		279
+ __SYSCALL(__NR_move_pages, sys_move_pages)
++#define __NR_sched_yield_to	280
++__SYSCALL(__NR_sched_yield_to, sys_sched_yield_to)
+ 
+-#define __NR_syscall_max __NR_move_pages
++#define __NR_syscall_max __NR_sched_yield_to
+ 
+ #ifndef __NO_STUBS
+ #define __ARCH_WANT_OLD_READDIR
+Index: linux-cfs-2.6.20.8.q/include/linux/hardirq.h
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/include/linux/hardirq.h
++++ linux-cfs-2.6.20.8.q/include/linux/hardirq.h
+@@ -79,6 +79,19 @@
+ #endif
+ 
+ #ifdef CONFIG_PREEMPT
++# define PREEMPT_CHECK_OFFSET 1
++#else
++# define PREEMPT_CHECK_OFFSET 0
++#endif
++
++/*
++ * Check whether we were atomic before we did preempt_disable():
++ * (used by the scheduler)
++ */
++#define in_atomic_preempt_off() \
++		((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
++
++#ifdef CONFIG_PREEMPT
+ # define preemptible()	(preempt_count() == 0 && !irqs_disabled())
+ # define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1)
+ #else
+Index: linux-cfs-2.6.20.8.q/include/linux/ktime.h
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/include/linux/ktime.h
++++ linux-cfs-2.6.20.8.q/include/linux/ktime.h
+@@ -274,4 +274,6 @@ extern void ktime_get_ts(struct timespec
+ /* Get the real (wall-) time in timespec format: */
+ #define ktime_get_real_ts(ts)	getnstimeofday(ts)
+ 
++extern ktime_t ktime_get(void);
++
+ #endif
+Index: linux-cfs-2.6.20.8.q/include/linux/sched.h
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/include/linux/sched.h
++++ linux-cfs-2.6.20.8.q/include/linux/sched.h
+@@ -2,7 +2,6 @@
+ #define _LINUX_SCHED_H
+ 
+ #include <linux/auxvec.h>	/* For AT_VECTOR_SIZE */
+-
+ /*
+  * cloning flags:
+  */
+@@ -37,6 +36,8 @@
+ 
+ #ifdef __KERNEL__
+ 
++#include <linux/rbtree.h>	/* For run_node */
++
+ struct sched_param {
+ 	int sched_priority;
+ };
+@@ -196,13 +197,13 @@ extern void init_idle(struct task_struct
+ extern cpumask_t nohz_cpu_mask;
+ 
+ /*
+- * Only dump TASK_* tasks. (-1 for all tasks)
++ * Only dump TASK_* tasks. (0 for all tasks)
+  */
+ extern void show_state_filter(unsigned long state_filter);
+ 
+ static inline void show_state(void)
+ {
+-	show_state_filter(-1);
++	show_state_filter(0);
+ }
+ 
+ extern void show_regs(struct pt_regs *);
+@@ -464,7 +465,7 @@ struct signal_struct {
+ 	 * from jiffies_to_ns(utime + stime) if sched_clock uses something
+ 	 * other than jiffies.)
+ 	 */
+-	unsigned long long sched_time;
++	unsigned long long sum_sched_runtime;
+ 
+ 	/*
+ 	 * We don't bother to synchronize most readers of this at all,
+@@ -524,6 +525,7 @@ struct signal_struct {
+ #define MAX_RT_PRIO		MAX_USER_RT_PRIO
+ 
+ #define MAX_PRIO		(MAX_RT_PRIO + 40)
++#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)
+ 
+ #define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
+ #define rt_task(p)		rt_prio((p)->prio)
+@@ -635,7 +637,14 @@ enum idle_type
+ /*
+  * sched-domains (multiprocessor balancing) declarations:
+  */
+-#define SCHED_LOAD_SCALE	128UL	/* increase resolution of load */
++
++/*
++ * Increase resolution of nice-level calculations:
++ */
++#define SCHED_LOAD_SHIFT	10
++#define SCHED_LOAD_SCALE	(1UL << SCHED_LOAD_SHIFT)
++
++#define SCHED_LOAD_SCALE_FUZZ	(SCHED_LOAD_SCALE >> 5)
+ 
+ #ifdef CONFIG_SMP
+ #define SD_LOAD_BALANCE		1	/* Do load balancing on this domain. */
+@@ -684,7 +693,6 @@ struct sched_domain {
+ 	unsigned int imbalance_pct;	/* No balance until over watermark */
+ 	unsigned long long cache_hot_time; /* Task considered cache hot (ns) */
+ 	unsigned int cache_nice_tries;	/* Leave cache hot tasks for # tries */
+-	unsigned int per_cpu_gain;	/* CPU % gained by adding domain cpus */
+ 	unsigned int busy_idx;
+ 	unsigned int idle_idx;
+ 	unsigned int newidle_idx;
+@@ -733,12 +741,6 @@ struct sched_domain {
+ extern int partition_sched_domains(cpumask_t *partition1,
+ 				    cpumask_t *partition2);
+ 
+-/*
+- * Maximum cache size the migration-costs auto-tuning code will
+- * search from:
+- */
+-extern unsigned int max_cache_size;
+-
+ #endif	/* CONFIG_SMP */
+ 
+ 
+@@ -789,14 +791,28 @@ struct mempolicy;
+ struct pipe_inode_info;
+ struct uts_namespace;
+ 
+-enum sleep_type {
+-	SLEEP_NORMAL,
+-	SLEEP_NONINTERACTIVE,
+-	SLEEP_INTERACTIVE,
+-	SLEEP_INTERRUPTED,
+-};
++struct rq;
+ 
+-struct prio_array;
++struct sched_class {
++	struct sched_class *next;
++
++	void (*enqueue_task) (struct rq *rq, struct task_struct *p,
++			      int wakeup, u64 now);
++	void (*dequeue_task) (struct rq *rq, struct task_struct *p,
++			      int sleep, u64 now);
++	void (*yield_task) (struct rq *rq, struct task_struct *p,
++			    struct task_struct *p_to);
++
++	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
++
++	struct task_struct * (*pick_next_task) (struct rq *rq, u64 now);
++	void (*put_prev_task) (struct rq *rq, struct task_struct *p, u64 now);
++
++	struct task_struct * (*load_balance_start) (struct rq *rq);
++	struct task_struct * (*load_balance_next) (struct rq *rq);
++	void (*task_tick) (struct rq *rq, struct task_struct *p);
++	void (*task_new) (struct rq *rq, struct task_struct *p);
++};
+ 
+ struct task_struct {
+ 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
+@@ -813,26 +829,45 @@ struct task_struct {
+ #endif
+ #endif
+ 	int load_weight;	/* for niceness load balancing purposes */
++	int load_shift;
++
+ 	int prio, static_prio, normal_prio;
++	int on_rq;
+ 	struct list_head run_list;
+-	struct prio_array *array;
++	struct rb_node run_node;
+ 
+ 	unsigned short ioprio;
+ #ifdef CONFIG_BLK_DEV_IO_TRACE
+ 	unsigned int btrace_seq;
+ #endif
+-	unsigned long sleep_avg;
+-	unsigned long long timestamp, last_ran;
+-	unsigned long long sched_time; /* sched_clock time spent running */
+-	enum sleep_type sleep_type;
++	/* CFS scheduling class statistics fields: */
++	u64 wait_start_fair;
++	u64 wait_start;
++	u64 exec_start;
++	u64 sleep_start;
++	u64 block_start;
++	u64 sleep_max;
++	u64 block_max;
++	u64 exec_max;
++	u64 wait_max;
++	u64 last_ran;
++
++	s64 wait_runtime;
++	u64 sum_exec_runtime;
++	s64 fair_key;
++	s64 sum_wait_runtime;
+ 
+ 	unsigned long policy;
+ 	cpumask_t cpus_allowed;
+-	unsigned int time_slice, first_time_slice;
++	unsigned int time_slice;
++	struct sched_class *sched_class;
++
++	s64 min_wait_runtime;
+ 
+ #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+ 	struct sched_info sched_info;
+ #endif
++	u64 nr_switches;
+ 
+ 	struct list_head tasks;
+ 	/*
+@@ -1195,8 +1230,9 @@ static inline int set_cpus_allowed(struc
+ #endif
+ 
+ extern unsigned long long sched_clock(void);
++extern void sched_clock_unstable_event(void);
+ extern unsigned long long
+-current_sched_time(const struct task_struct *current_task);
++current_sched_runtime(const struct task_struct *current_task);
+ 
+ /* sched_exec is called by processes performing an exec */
+ #ifdef CONFIG_SMP
+@@ -1212,6 +1248,13 @@ static inline void idle_task_exit(void) 
+ #endif
+ 
+ extern void sched_idle_next(void);
++extern char * sched_print_task_state(struct task_struct *p, char *buffer);
++
++extern unsigned int sysctl_sched_granularity;
++extern unsigned int sysctl_sched_wakeup_granularity;
++extern unsigned int sysctl_sched_sleep_history_max;
++extern unsigned int sysctl_sched_child_runs_first;
++extern unsigned int sysctl_sched_load_smoothing;
+ 
+ #ifdef CONFIG_RT_MUTEXES
+ extern int rt_mutex_getprio(struct task_struct *p);
+@@ -1290,8 +1333,7 @@ extern void FASTCALL(wake_up_new_task(st
+ #else
+  static inline void kick_process(struct task_struct *tsk) { }
+ #endif
+-extern void FASTCALL(sched_fork(struct task_struct * p, int clone_flags));
+-extern void FASTCALL(sched_exit(struct task_struct * p));
++extern void sched_fork(struct task_struct * p, int clone_flags);
+ 
+ extern int in_group_p(gid_t);
+ extern int in_egroup_p(gid_t);
+Index: linux-cfs-2.6.20.8.q/include/linux/topology.h
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/include/linux/topology.h
++++ linux-cfs-2.6.20.8.q/include/linux/topology.h
+@@ -96,7 +96,6 @@
+ 	.busy_factor		= 64,			\
+ 	.imbalance_pct		= 110,			\
+ 	.cache_nice_tries	= 0,			\
+-	.per_cpu_gain		= 25,			\
+ 	.busy_idx		= 0,			\
+ 	.idle_idx		= 0,			\
+ 	.newidle_idx		= 1,			\
+@@ -128,7 +127,6 @@
+ 	.busy_factor		= 64,			\
+ 	.imbalance_pct		= 125,			\
+ 	.cache_nice_tries	= 1,			\
+-	.per_cpu_gain		= 100,			\
+ 	.busy_idx		= 2,			\
+ 	.idle_idx		= 1,			\
+ 	.newidle_idx		= 2,			\
+@@ -159,7 +157,6 @@
+ 	.busy_factor		= 64,			\
+ 	.imbalance_pct		= 125,			\
+ 	.cache_nice_tries	= 1,			\
+-	.per_cpu_gain		= 100,			\
+ 	.busy_idx		= 2,			\
+ 	.idle_idx		= 1,			\
+ 	.newidle_idx		= 2,			\
+@@ -193,7 +190,6 @@
+ 	.newidle_idx		= 0, /* unused */	\
+ 	.wake_idx		= 0, /* unused */	\
+ 	.forkexec_idx		= 0, /* unused */	\
+-	.per_cpu_gain		= 100,			\
+ 	.flags			= SD_LOAD_BALANCE	\
+ 				| SD_SERIALIZE,	\
+ 	.last_balance		= jiffies,		\
+Index: linux-cfs-2.6.20.8.q/init/main.c
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/init/main.c
++++ linux-cfs-2.6.20.8.q/init/main.c
+@@ -422,7 +422,7 @@ static void noinline rest_init(void)
+ 
+ 	/*
+ 	 * The boot idle thread must execute schedule()
+-	 * at least one to get things moving:
++	 * at least once to get things moving:
+ 	 */
+ 	preempt_enable_no_resched();
+ 	schedule();
+Index: linux-cfs-2.6.20.8.q/kernel/exit.c
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/kernel/exit.c
++++ linux-cfs-2.6.20.8.q/kernel/exit.c
+@@ -112,7 +112,7 @@ static void __exit_signal(struct task_st
+ 		sig->maj_flt += tsk->maj_flt;
+ 		sig->nvcsw += tsk->nvcsw;
+ 		sig->nivcsw += tsk->nivcsw;
+-		sig->sched_time += tsk->sched_time;
++		sig->sum_sched_runtime += tsk->sum_exec_runtime;
+ 		sig = NULL; /* Marker for below. */
+ 	}
+ 
+@@ -170,7 +170,6 @@ repeat:
+ 		zap_leader = (leader->exit_signal == -1);
+ 	}
+ 
+-	sched_exit(p);
+ 	write_unlock_irq(&tasklist_lock);
+ 	proc_flush_task(p);
+ 	release_thread(p);
+Index: linux-cfs-2.6.20.8.q/kernel/fork.c
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/kernel/fork.c
++++ linux-cfs-2.6.20.8.q/kernel/fork.c
+@@ -874,7 +874,7 @@ static inline int copy_signal(unsigned l
+ 	sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
+ 	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
+ 	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
+-	sig->sched_time = 0;
++	sig->sum_sched_runtime = 0;
+ 	INIT_LIST_HEAD(&sig->cpu_timers[0]);
+ 	INIT_LIST_HEAD(&sig->cpu_timers[1]);
+ 	INIT_LIST_HEAD(&sig->cpu_timers[2]);
+@@ -1037,7 +1037,7 @@ static struct task_struct *copy_process(
+ 
+ 	p->utime = cputime_zero;
+ 	p->stime = cputime_zero;
+- 	p->sched_time = 0;
++
+ 	p->rchar = 0;		/* I/O counter: bytes read */
+ 	p->wchar = 0;		/* I/O counter: bytes written */
+ 	p->syscr = 0;		/* I/O counter: read syscalls */
+Index: linux-cfs-2.6.20.8.q/kernel/hrtimer.c
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/kernel/hrtimer.c
++++ linux-cfs-2.6.20.8.q/kernel/hrtimer.c
+@@ -45,7 +45,7 @@
+  *
+  * returns the time in ktime_t format
+  */
+-static ktime_t ktime_get(void)
++ktime_t ktime_get(void)
+ {
+ 	struct timespec now;
+ 
+Index: linux-cfs-2.6.20.8.q/kernel/posix-cpu-timers.c
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/kernel/posix-cpu-timers.c
++++ linux-cfs-2.6.20.8.q/kernel/posix-cpu-timers.c
+@@ -161,7 +161,7 @@ static inline cputime_t virt_ticks(struc
+ }
+ static inline unsigned long long sched_ns(struct task_struct *p)
+ {
+-	return (p == current) ? current_sched_time(p) : p->sched_time;
++	return (p == current) ? current_sched_runtime(p) : p->sum_exec_runtime;
+ }
+ 
+ int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
+@@ -246,10 +246,10 @@ static int cpu_clock_sample_group_locked
+ 		} while (t != p);
+ 		break;
+ 	case CPUCLOCK_SCHED:
+-		cpu->sched = p->signal->sched_time;
++		cpu->sched = p->signal->sum_sched_runtime;
+ 		/* Add in each other live thread.  */
+ 		while ((t = next_thread(t)) != p) {
+-			cpu->sched += t->sched_time;
++			cpu->sched += t->sum_exec_runtime;
+ 		}
+ 		cpu->sched += sched_ns(p);
+ 		break;
+@@ -417,7 +417,7 @@ int posix_cpu_timer_del(struct k_itimer 
+  */
+ static void cleanup_timers(struct list_head *head,
+ 			   cputime_t utime, cputime_t stime,
+-			   unsigned long long sched_time)
++			   unsigned long long sum_exec_runtime)
+ {
+ 	struct cpu_timer_list *timer, *next;
+ 	cputime_t ptime = cputime_add(utime, stime);
+@@ -446,10 +446,10 @@ static void cleanup_timers(struct list_h
+ 	++head;
+ 	list_for_each_entry_safe(timer, next, head, entry) {
+ 		list_del_init(&timer->entry);
+-		if (timer->expires.sched < sched_time) {
++		if (timer->expires.sched < sum_exec_runtime) {
+ 			timer->expires.sched = 0;
+ 		} else {
+-			timer->expires.sched -= sched_time;
++			timer->expires.sched -= sum_exec_runtime;
+ 		}
+ 	}
+ }
+@@ -462,7 +462,7 @@ static void cleanup_timers(struct list_h
+ void posix_cpu_timers_exit(struct task_struct *tsk)
+ {
+ 	cleanup_timers(tsk->cpu_timers,
+-		       tsk->utime, tsk->stime, tsk->sched_time);
++		       tsk->utime, tsk->stime, tsk->sum_exec_runtime);
+ 
+ }
+ void posix_cpu_timers_exit_group(struct task_struct *tsk)
+@@ -470,7 +470,7 @@ void posix_cpu_timers_exit_group(struct 
+ 	cleanup_timers(tsk->signal->cpu_timers,
+ 		       cputime_add(tsk->utime, tsk->signal->utime),
+ 		       cputime_add(tsk->stime, tsk->signal->stime),
+-		       tsk->sched_time + tsk->signal->sched_time);
++		       tsk->sum_exec_runtime + tsk->signal->sum_sched_runtime);
+ }
+ 
+ 
+@@ -531,7 +531,7 @@ static void process_timer_rebalance(stru
+ 		nsleft = max_t(unsigned long long, nsleft, 1);
+ 		do {
+ 			if (likely(!(t->flags & PF_EXITING))) {
+-				ns = t->sched_time + nsleft;
++				ns = t->sum_exec_runtime + nsleft;
+ 				if (t->it_sched_expires == 0 ||
+ 				    t->it_sched_expires > ns) {
+ 					t->it_sched_expires = ns;
+@@ -999,7 +999,7 @@ static void check_thread_timers(struct t
+ 		struct cpu_timer_list *t = list_entry(timers->next,
+ 						      struct cpu_timer_list,
+ 						      entry);
+-		if (!--maxfire || tsk->sched_time < t->expires.sched) {
++		if (!--maxfire || tsk->sum_exec_runtime < t->expires.sched) {
+ 			tsk->it_sched_expires = t->expires.sched;
+ 			break;
+ 		}
+@@ -1019,7 +1019,7 @@ static void check_process_timers(struct 
+ 	int maxfire;
+ 	struct signal_struct *const sig = tsk->signal;
+ 	cputime_t utime, stime, ptime, virt_expires, prof_expires;
+-	unsigned long long sched_time, sched_expires;
++	unsigned long long sum_sched_runtime, sched_expires;
+ 	struct task_struct *t;
+ 	struct list_head *timers = sig->cpu_timers;
+ 
+@@ -1039,12 +1039,12 @@ static void check_process_timers(struct 
+ 	 */
+ 	utime = sig->utime;
+ 	stime = sig->stime;
+-	sched_time = sig->sched_time;
++	sum_sched_runtime = sig->sum_sched_runtime;
+ 	t = tsk;
+ 	do {
+ 		utime = cputime_add(utime, t->utime);
+ 		stime = cputime_add(stime, t->stime);
+-		sched_time += t->sched_time;
++		sum_sched_runtime += t->sum_exec_runtime;
+ 		t = next_thread(t);
+ 	} while (t != tsk);
+ 	ptime = cputime_add(utime, stime);
+@@ -1085,7 +1085,7 @@ static void check_process_timers(struct 
+ 		struct cpu_timer_list *t = list_entry(timers->next,
+ 						      struct cpu_timer_list,
+ 						      entry);
+-		if (!--maxfire || sched_time < t->expires.sched) {
++		if (!--maxfire || sum_sched_runtime < t->expires.sched) {
+ 			sched_expires = t->expires.sched;
+ 			break;
+ 		}
+@@ -1177,7 +1177,7 @@ static void check_process_timers(struct 
+ 		virt_left = cputime_sub(virt_expires, utime);
+ 		virt_left = cputime_div_non_zero(virt_left, nthreads);
+ 		if (sched_expires) {
+-			sched_left = sched_expires - sched_time;
++			sched_left = sched_expires - sum_sched_runtime;
+ 			do_div(sched_left, nthreads);
+ 			sched_left = max_t(unsigned long long, sched_left, 1);
+ 		} else {
+@@ -1203,7 +1203,7 @@ static void check_process_timers(struct 
+ 				t->it_virt_expires = ticks;
+ 			}
+ 
+-			sched = t->sched_time + sched_left;
++			sched = t->sum_exec_runtime + sched_left;
+ 			if (sched_expires && (t->it_sched_expires == 0 ||
+ 					      t->it_sched_expires > sched)) {
+ 				t->it_sched_expires = sched;
+@@ -1295,7 +1295,7 @@ void run_posix_cpu_timers(struct task_st
+ 
+ 	if (UNEXPIRED(prof) && UNEXPIRED(virt) &&
+ 	    (tsk->it_sched_expires == 0 ||
+-	     tsk->sched_time < tsk->it_sched_expires))
++	     tsk->sum_exec_runtime < tsk->it_sched_expires))
+ 		return;
+ 
+ #undef	UNEXPIRED
+Index: linux-cfs-2.6.20.8.q/kernel/sched.c
+===================================================================
+--- linux-cfs-2.6.20.8.q.orig/kernel/sched.c
++++ linux-cfs-2.6.20.8.q/kernel/sched.c
+@@ -89,110 +89,13 @@
+  */
+ #define MIN_TIMESLICE		max(5 * HZ / 1000, 1)
+ #define DEF_TIMESLICE		(100 * HZ / 1000)
+-#define ON_RUNQUEUE_WEIGHT	 30
+-#define CHILD_PENALTY		 95
+-#define PARENT_PENALTY		100
+-#define EXIT_WEIGHT		  3
+-#define PRIO_BONUS_RATIO	 25
+-#define MAX_BONUS		(MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
+-#define INTERACTIVE_DELTA	  2
+-#define MAX_SLEEP_AVG		(DEF_TIMESLICE * MAX_BONUS)
+-#define STARVATION_LIMIT	(MAX_SLEEP_AVG)
+-#define NS_MAX_SLEEP_AVG	(JIFFIES_TO_NS(MAX_SLEEP_AVG))
+-
+-/*
+- * If a task is 'interactive' then we reinsert it in the active
+- * array after it has expired its current timeslice. (it will not
+- * continue to run immediately, it will still roundrobin with
+- * other interactive tasks.)
+- *
+- * This part scales the interactivity limit depending on niceness.
+- *
+- * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
+- * Here are a few examples of different nice levels:
+- *
+- *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
+- *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
+- *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
+- *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
+- *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
+- *
+- * (the X axis represents the possible -5 ... 0 ... +5 dynamic
+- *  priority range a task can explore, a value of '1' means the
+- *  task is rated interactive.)
+- *
+- * Ie. nice +19 tasks can never get 'interactive' enough to be
+- * reinserted into the active array. And only heavily CPU-hog nice -20
+- * tasks will be expired. Default nice 0 tasks are somewhere between,
+- * it takes some effort for them to get interactive, but it's not
+- * too hard.
+- */
+-
+-#define CURRENT_BONUS(p) \
+-	(NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
+-		MAX_SLEEP_AVG)
+-
+-#define GRANULARITY	(10 * HZ / 1000 ? : 1)
+-
+-#ifdef CONFIG_SMP
+-#define TIMESLICE_GRANULARITY(p)	(GRANULARITY * \
+-		(1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
+-			num_online_cpus())
+-#else
+-#define TIMESLICE_GRANULARITY(p)	(GRANULARITY * \
+-		(1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
+-#endif
+-
+-#define SCALE(v1,v1_max,v2_max) \
+-	(v1) * (v2_max) / (v1_max)
+-
+-#define DELTA(p) \
+-	(SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
+-		INTERACTIVE_DELTA)
+-
+-#define TASK_INTERACTIVE(p) \
+-	((p)->prio <= (p)->static_prio - DELTA(p))
+-
+-#define INTERACTIVE_SLEEP(p) \
+-	(JIFFIES_TO_NS(MAX_SLEEP_AVG * \
+-		(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
+-
+-#define TASK_PREEMPTS_CURR(p, rq) \
+-	((p)->prio < (rq)->curr->prio)
+-
+-#define SCALE_PRIO(x, prio) \
+-	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
+-
+-static unsigned int static_prio_timeslice(int static_prio)
+-{
+-	if (static_prio < NICE_TO_PRIO(0))
+-		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
+-	else
+-		return SCALE_PRIO(DEF_TIMESLICE, static_prio);
+-}
+-
+-/*
+- * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
+- * to time slice values: [800ms ... 100ms ... 5ms]
+- *
+- * The higher a thread's priority, the bigger timeslices
+- * it gets during one round of execution. But even the lowest
+- * priority thread gets MIN_TIMESLICE worth of execution time.
+- */
+-
+-static inline unsigned int task_timeslice(struct task_struct *p)
+-{
+-	return static_prio_timeslice(p->static_prio);
+-}
+ 
+ /*
+- * These are the runqueue data structures:
++ * This is the priority-queue data structure of the RT scheduling class:
+  */
+-
+ struct prio_array {
+-	unsigned int nr_active;
+-	DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
+-	struct list_head queue[MAX_PRIO];
++	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
++	struct list_head queue[MAX_RT_PRIO];
+ };
+ 
+ /*
+@@ -209,12 +112,13 @@ struct rq {
+ 	 * nr_running and cpu_load should be in the same cacheline because
+ 	 * remote CPUs use both these fields when doing load calculation.
+ 	 */
+-	unsigned long nr_running;
++	long nr_running;
+ 	unsigned long raw_weighted_load;
+-#ifdef CONFIG_SMP
+-	unsigned long cpu_load[3];
+-#endif
+-	unsigned long long nr_switches;
++	#define CPU_LOAD_IDX_MAX 5
++	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
++
++	u64 nr_switches;
++	unsigned long nr_load_updates;
+ 
+ 	/*
+ 	 * This is part of a global counter where only the total sum
+@@ -224,14 +128,29 @@ struct rq {
+ 	 */
+ 	unsigned long nr_uninterruptible;
+ 
+-	unsigned long expired_timestamp;
+-	/* Cached timestamp set by update_cpu_clock() */
+-	unsigned long long most_recent_timestamp;
+ 	struct task_struct *curr, *idle;
+ 	unsigned long next_balance;
+ 	struct mm_struct *prev_mm;
+-	struct prio_array *active, *expired, arrays[2];
+-	int best_expired_prio;
++
++	u64 clock, prev_clock_raw;
++	s64 clock_max_delta;
++	u64 fair_clock, prev_fair_clock;
++	u64 exec_clock, prev_exec_clock;
++	u64 wait_runtime;
++
++	unsigned int clock_warps;
++	unsigned int clock_unstable_events;
++
++	struct sched_class *load_balance_class;
++
++	struct prio_array active;
++	int rt_load_balance_idx;
++	struct list_head *rt_load_balance_head, *rt_load_balance_curr;
++
++	struct rb_root tasks_timeline;
++	struct rb_node *rb_leftmost;
++	struct rb_node *rb_load_balance_curr;
++
+ 	atomic_t nr_iowait;
+ 
+ #ifdef CONFIG_SMP
+@@ -268,7 +187,107 @@ struct rq {
+ 	struct lock_class_key rq_lock_key;
+ };
+ 
+-static DEFINE_PER_CPU(struct rq, runqueues);
++static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
++
++static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
++{
++	rq->curr->sched_class->check_preempt_curr(rq, p);
++}
++
++#define SCALE_PRIO(x, prio) \
++	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
++
++/*
++ * static_prio_timeslice() scales user-nice values [ -20 ... 0 ] to time
++ * slices [800ms ... 100ms], smaller above nice 0; nice +19 always gets 1.
++ */
++static unsigned int static_prio_timeslice(int static_prio)
++{
++	if (static_prio == NICE_TO_PRIO(19))
++		return 1;
++
++	if (static_prio < NICE_TO_PRIO(0))
++		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
++	else
++		return SCALE_PRIO(DEF_TIMESLICE, static_prio);
++}
++
++/*
++ * Print scheduling fields of p and of this CPU's rq; returns end of output:
++ */
++char * sched_print_task_state(struct task_struct *p, char *buffer)
++{
++	struct rq *this_rq = &per_cpu(runqueues, raw_smp_processor_id());
++	unsigned long long t0, t1;
++
++#define P(F) \
++	buffer += sprintf(buffer, "%-25s:%20Ld\n", #F, (long long)p->F)
++
++	P(wait_start);
++	P(wait_start_fair);
++	P(exec_start);
++	P(sleep_start);
++	P(block_start);
++	P(sleep_max);
++	P(block_max);
++	P(exec_max);
++	P(wait_max);
++	P(min_wait_runtime);
++	P(last_ran);
++	P(wait_runtime);
++	P(sum_exec_runtime);
++#undef P
++	/* difference between two back-to-back sched_clock() reads: */
++	t0 = sched_clock();
++	t1 = sched_clock();
++	buffer += sprintf(buffer, "%-25s:%20Ld\n", "clock-delta",
++				(long long)(t1 - t0));
++	buffer += sprintf(buffer, "%-25s:%20Ld\n", "rq-wait_runtime",
++				(long long)this_rq->wait_runtime);
++	buffer += sprintf(buffer, "%-25s:%20Ld\n", "rq-exec_clock",
++				(long long)this_rq->exec_clock);
++	buffer += sprintf(buffer, "%-25s:%20Ld\n", "rq-fair_clock",
++				(long long)this_rq->fair_clock);
++	buffer += sprintf(buffer, "%-25s:%20Ld\n", "rq-clock",
++				(long long)this_rq->clock);
++	buffer += sprintf(buffer, "%-25s:%20Ld\n", "rq-prev_clock_raw",
++				(long long)this_rq->prev_clock_raw);
++	buffer += sprintf(buffer, "%-25s:%20Ld\n", "rq-clock_max_delta",
++				(long long)this_rq->clock_max_delta);
++	buffer += sprintf(buffer, "%-25s:%20u\n", "rq-clock_warps",
++				this_rq->clock_warps);
++	buffer += sprintf(buffer, "%-25s:%20u\n", "rq-clock_unstable_events",
++				this_rq->clock_unstable_events);
++	return buffer;
++}
++
++/*
++ * Per-runqueue clock, as finegrained as sched_clock() can give us. Each
++ * call adds the raw time elapsed since the previous call to rq->clock:
++ */
++static inline unsigned long long __rq_clock(struct rq *rq)
++{
++	u64 raw = sched_clock();
++	s64 elapsed = raw - rq->prev_clock_raw;
++
++	rq->prev_clock_raw = raw;
++
++	if (likely(elapsed >= 0)) {
++		/* remember the largest forward step seen so far */
++		if (unlikely(elapsed > rq->clock_max_delta))
++			rq->clock_max_delta = elapsed;
++		rq->clock += elapsed;
++	} else {
++		/*
++		 * sched_clock() went backwards: count the warp and
++		 * move the clock forward by just one unit.
++		 */
++		rq->clock_warps++;
++		rq->clock++;
++	}
++
++	return rq->clock;
++}
+ 
+ static inline int cpu_of(struct rq *rq)
+ {
+@@ -279,6 +298,16 @@ static inline int cpu_of(struct rq *rq)
+ #endif
+ }
+ 
++/* Clock of rq: advanced first on rq's own CPU, last stored value otherwise. */
++static inline unsigned long long rq_clock(struct rq *rq)
++{
++	if (cpu_of(rq) != smp_processor_id())
++		return rq->clock;
++
++	/* only a call made on rq's own CPU moves the clock forward */
++	return __rq_clock(rq);
++}
++
+ /*
+  * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+  * See detach_destroy_domains: synchronize_sched for details.
+@@ -423,134 +452,6 @@ static inline void task_rq_unlock(struct
+ 	spin_unlock_irqrestore(&rq->lock, *flags);
+ }
+ 
+-#ifdef CONFIG_SCHEDSTATS
+-/*
+- * bump this up when changing the output format or the meaning of an existing
+- * format, so that tools can adapt (or abort)
+- */
+-#define SCHEDSTAT_VERSION 14
+-
+-static int show_schedstat(struct seq_file *seq, void *v)
+-{
+-	int cpu;
+-
+-	seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
+-	seq_printf(seq, "timestamp %lu\n", jiffies);
+-	for_each_online_cpu(cpu) {
+-		struct rq *rq = cpu_rq(cpu);
+-#ifdef CONFIG_SMP
+-		struct sched_domain *sd;
+-		int dcnt = 0;
+-#endif
+-
+-		/* runqueue-specific stats */
+-		seq_printf(seq,
+-		    "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
+-		    cpu, rq->yld_both_empty,
+-		    rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
+-		    rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
+-		    rq->ttwu_cnt, rq->ttwu_local,
+-		    rq->rq_sched_info.cpu_time,
+-		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
+-
+-		seq_printf(seq, "\n");
+-
+-#ifdef CONFIG_SMP
+-		/* domain-specific stats */
+-		preempt_disable();
+-		for_each_domain(cpu, sd) {
+-			enum idle_type itype;
+-			char mask_str[NR_CPUS];
+-
+-			cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
+-			seq_printf(seq, "domain%d %s", dcnt++, mask_str);
+-			for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
+-					itype++) {
+-				seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
+-						"%lu",
+-				    sd->lb_cnt[itype],
+-				    sd->lb_balanced[itype],
+-				    sd->lb_failed[itype],
+-				    sd->lb_imbalance[itype],
+-				    sd->lb_gained[itype],
+-				    sd->lb_hot_gained[itype],
+-				    sd->lb_nobusyq[itype],
+-				    sd->lb_nobusyg[itype]);
+-			}
+-			seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
+-			    " %lu %lu %lu\n",
+-			    sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
+-			    sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
+-			    sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
+-			    sd->ttwu_wake_remote, sd->ttwu_move_affine,
+-			    sd->ttwu_move_balance);
+-		}
+-		preempt_enable();
+-#endif
+-	}
+-	return 0;
+-}
+-
+-static int schedstat_open(struct inode *inode, struct file *file)
+-{
+-	unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
+-	char *buf = kmalloc(size, GFP_KERNEL);
+-	struct seq_file *m;
+-	int res;
+-
+-	if (!buf)
+-		return -ENOMEM;
+-	res = single_open(file, show_schedstat, NULL);
+-	if (!res) {
+-		m = file->private_data;
+-		m->buf = buf;
+-		m->size = size;
+-	} else
+-		kfree(buf);
+-	return res;
+-}
+-
+-const struct file_operations proc_schedstat_operations = {
+-	.open    = schedstat_open,
+-	.read    = seq_read,
+-	.llseek  = seq_lseek,
+-	.release = single_release,
+-};
+-
+-/*
+- * Expects runqueue lock to be held for atomicity of update
+- */
+-static inline void
+-rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
+-{
+-	if (rq) {
+-		rq->rq_sched_info.run_delay += delta_jiffies;
+-		rq->rq_sched_info.pcnt++;
+-	}
+-}
+-
+-/*
+- * Expects runqueue lock to be held for atomicity of update
+- */
+-static inline void
+-rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
+-{
+-	if (rq)
+-		rq->rq_sched_info.cpu_time += delta_jiffies;
+-}
+-# define schedstat_inc(rq, field)	do { (rq)->field++; } while (0)
+-# define schedstat_add(rq, field, amt)	do { (rq)->field += (amt); } while (0)
+-#else /* !CONFIG_SCHEDSTATS */
+-static inline void
+-rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
+-{}
+-static inline void
+-rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
+-{}
+-# define schedstat_inc(rq, field)	do { } while (0)
+-# define schedstat_add(rq, field, amt)	do { } while (0)
+-#endif
+-
+ /*
+  * this_rq_lock - lock this runqueue and disable interrupts.
+  */
+@@ -566,178 +467,60 @@ static inline struct rq *this_rq_lock(vo
+ 	return rq;
+ }
+ 
+-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+-/*
+- * Called when a process is dequeued from the active array and given
+- * the cpu.  We should note that with the exception of interactive
+- * tasks, the expired queue will become the active queue after the active
+- * queue is empty, without explicitly dequeuing and requeuing tasks in the
+- * expired queue.  (Interactive tasks may be requeued directly to the
+- * active queue, thus delaying tasks in the expired queue from running;
+- * see scheduler_tick()).
+- *
+- * This function is only called from sched_info_arrive(), rather than
+- * dequeue_task(). Even though a task may be queued and dequeued multiple
+- * times as it is shuffled about, we're really interested in knowing how
+- * long it was from the *first* time it was queued to the time that it
+- * finally hit a cpu.
+- */
+-static inline void sched_info_dequeued(struct task_struct *t)
+-{
+-	t->sched_info.last_queued = 0;
+-}
+-
+ /*
+- * Called when a task finally hits the cpu.  We can now calculate how
+- * long it was waiting to run.  We also note when it began so that we
+- * can keep stats on how long its timeslice is.
++ * CPU frequency is/was unstable - start anew by resetting prev_clock_raw:
+  */
+-static void sched_info_arrive(struct task_struct *t)
++void sched_clock_unstable_event(void)
+ {
+-	unsigned long now = jiffies, delta_jiffies = 0;
+-
+-	if (t->sched_info.last_queued)
+-		delta_jiffies = now - t->sched_info.last_queued;
+-	sched_info_dequeued(t);
+-	t->sched_info.run_delay += delta_jiffies;
+-	t->sched_info.last_arrival = now;
+-	t->sched_info.pcnt++;
++	unsigned long flags;
++	struct rq *rq;
+ 
+-	rq_sched_info_arrive(task_rq(t), delta_jiffies);
++	rq = task_rq_lock(current, &flags);
++	rq->prev_clock_raw = sched_clock();
++	rq->clock_unstable_events++;
++	task_rq_unlock(rq, &flags);
+ }
+ 
+ /*
+- * Called when a process is queued into either the active or expired
+- * array.  The time is noted and later used to determine how long we
+- * had to wait for us to reach the cpu.  Since the expired queue will
+- * become the active queue after active queue is empty, without dequeuing
+- * and requeuing any tasks, we are interested in queuing to either. It
+- * is unusual but not impossible for tasks to be dequeued and immediately
+- * requeued in the same or another array: this can happen in sched_yield(),
+- * set_user_nice(), and even load_balance() as it moves tasks from runqueue
+- * to runqueue.
++ * resched_task - mark a task 'to be rescheduled now'.
+  *
+- * This function is only called from enqueue_task(), but also only updates
+- * the timestamp if it is already not set.  It's assumed that
+- * sched_info_dequeued() will clear that stamp when appropriate.
+- */
+-static inline void sched_info_queued(struct task_struct *t)
+-{
+-	if (unlikely(sched_info_on()))
+-		if (!t->sched_info.last_queued)
+-			t->sched_info.last_queued = jiffies;
+-}
+-
+-/*
+- * Called when a process ceases being the active-running process, either
+- * voluntarily or involuntarily.  Now we can calculate how long we ran.
++ * On UP this means the setting of the need_resched flag, on SMP it
++ * might also involve a cross-CPU call to trigger the scheduler on
++ * the target CPU.
+  */
+-static inline void sched_info_depart(struct task_struct *t)
+-{
+-	unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival;
++#ifdef CONFIG_SMP
+ 
+-	t->sched_info.cpu_time += delta_jiffies;
+-	rq_sched_info_depart(task_rq(t), delta_jiffies);
+-}
++#ifndef tsk_is_polling
++#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
++#endif
+ 
+-/*
+- * Called when tasks are switched involuntarily due, typically, to expiring
+- * their time slice.  (This may also be called when switching to or from
+- * the idle task.)  We are only called when prev != next.
+- */
+-static inline void
+-__sched_info_switch(struct task_struct *prev, struct task_struct *next)
++static void resched_task(struct task_struct *p)
+ {
+-	struct rq *rq = task_rq(prev);
+-
+-	/*
+-	 * prev now departs the cpu.  It's not interesting to record
+-	 * stats about how efficient we were at scheduling the idle
+-	 * process, however.
+-	 */
+-	if (prev != rq->idle)
+-		sched_info_depart(prev);
++	int cpu;
+ 
+-	if (next != rq->idle)
+-		sched_info_arrive(next);
+-}
+-static inline void
+-sched_info_switch(struct task_struct *prev, struct task_struct *next)
+-{
+-	if (unlikely(sched_info_on()))
+-		__sched_info_switch(prev, next);
+-}
+-#else
+-#define sched_info_queued(t)		do { } while (0)
+-#define sched_info_switch(t, next)	do { } while (0)
+-#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
++	assert_spin_locked(&task_rq(p)->lock);
+ 
+-/*
+- * Adding/removing a task to/from a priority array:
+- */
+-static void dequeue_task(struct task_struct *p, struct prio_array *array)
+-{
+-	array->nr_active--;
+-	list_del(&p->run_list);
+-	if (list_empty(array->queue + p->prio))
+-		__clear_bit(p->prio, array->bitmap);
+-}
++	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
++		return;
+ 
+-static void enqueue_task(struct task_struct *p, struct prio_array *array)
+-{
+-	sched_info_queued(p);
+-	list_add_tail(&p->run_list, array->queue + p->prio);
+-	__set_bit(p->prio, array->bitmap);
+-	array->nr_active++;
+-	p->array = array;
+-}
++	set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+ 
+-/*
+- * Put task to the end of the run list without the overhead of dequeue
+- * followed by enqueue.
+- */
+-static void requeue_task(struct task_struct *p, struct prio_array *array)
+-{
+-	list_move_tail(&p->run_list, array->queue + p->prio);
+-}
++	cpu = task_cpu(p);
++	if (cpu == smp_processor_id())
++		return;
+ 
+-static inline void
+-enqueue_task_head(struct task_struct *p, struct prio_array *array)
+-{
+-	list_add(&p->run_list, array->queue + p->prio);
+-	__set_bit(p->prio, array->bitmap);
+-	array->nr_active++;
+-	p->array = array;
++	/* NEED_RESCHED must be visible before we test polling */
++	smp_mb();
++	if (!tsk_is_polling(p))
++		smp_send_reschedule(cpu);
+ }
+-
+-/*
+- * __normal_prio - return the priority that is based on the static
+- * priority but is modified by bonuses/penalties.
+- *
+- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
+- * into the -5 ... 0 ... +5 bonus/penalty range.
+- *
+- * We use 25% of the full 0...39 priority range so that:
+- *
+- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
+- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
+- *
+- * Both properties are important to certain workloads.
+- */
+-
+-static inline int __normal_prio(struct task_struct *p)
++#else
++static inline void resched_task(struct task_struct *p)
+ {
+-	int bonus, prio;
+-
+-	bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
+-
+-	prio = p->static_prio - bonus;
+-	if (prio < MAX_RT_PRIO)
+-		prio = MAX_RT_PRIO;
+-	if (prio > MAX_PRIO-1)
+-		prio = MAX_PRIO-1;
+-	return prio;
++	assert_spin_locked(&task_rq(p)->lock);
++	set_tsk_need_resched(p);
+ }
++#endif
+ 
+ /*
+  * To aid in avoiding the subversion of "niceness" due to uneven distribution
+@@ -761,22 +544,33 @@ static inline int __normal_prio(struct t
+ #define RTPRIO_TO_LOAD_WEIGHT(rp) \
+ 	(PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
+ 
++/*
++ * Nice levels are logarithmic. These are the load shifts assigned
++ * to nice levels, where every step of 2 nice levels changes the
++ * load weight (1 << shift) by a factor of 2:
++ */
++const int prio_to_load_shift[40] = {
++/* -20 */ 20, 19, 19, 18, 18, 17, 17, 16, 16, 15,
++/* -10 */ 15, 14, 14, 13, 13, 12, 12, 11, 11, 10,
++/*   0 */ 10,  9,  9,  8,  8,  7,  7,  6,  6,  5,
++/*  10 */  5,  4,  4,  3,  3,  2,  2,  1,  1,  0
++};
++
++/* Load shift for p's nice level; RT-range static prio and SCHED_BATCH get 0. */
++static int get_load_shift(struct task_struct *p)
++{
++	int prio = p->static_prio;
++
++	if (p->policy != SCHED_BATCH && !rt_prio(prio))
++		return prio_to_load_shift[prio - MAX_RT_PRIO];
++	return 0;
++}
++
+ static void set_load_weight(struct task_struct *p)
+ {
+-	if (has_rt_policy(p)) {
+-#ifdef CONFIG_SMP
+-		if (p == task_rq(p)->migration_thread)
+-			/*
+-			 * The migration thread does the actual balancing.
+-			 * Giving its load any weight will skew balancing
+-			 * adversely.
+-			 */
+-			p->load_weight = 0;
+-		else
+-#endif
+-			p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
+-	} else
+-		p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
++	p->load_shift = get_load_shift(p);
++	p->load_weight = 1 << p->load_shift;
++	p->wait_runtime = 0;
+ }
+ 
+ static inline void
+@@ -803,6 +597,40 @@ static inline void dec_nr_running(struct
+ 	dec_raw_weighted_load(rq, p);
+ }
+ 
++static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
++
++#include "sched_stats.h"
++#include "sched_rt.c"
++#include "sched_fair.c"
++#include "sched_debug.c"
++
++#define sched_class_highest (&rt_sched_class)
++
++static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
++{
++	u64 now = rq_clock(rq);	/* timestamp passed to the class hook */
++
++	sched_info_queued(p);
++	p->sched_class->enqueue_task(rq, p, wakeup, now);
++	p->on_rq = 1;		/* set once the class has queued p */
++}
++
++static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
++{
++	u64 now = rq_clock(rq);	/* timestamp passed to the class hook */
++
++	p->sched_class->dequeue_task(rq, p, sleep, now);
++	p->on_rq = 0;		/* cleared once the class has dequeued p */
++}
++
++/*
++ * __normal_prio - the normal priority is simply the task's static prio
++ */
++static inline int __normal_prio(struct task_struct *p)
++{
++	return p->static_prio;
++}
++
+ /*
+  * Calculate the expected normal priority: i.e. priority
+  * without taking RT-inheritance into account. Might be
+@@ -842,210 +670,31 @@ static int effective_prio(struct task_st
+ }
+ 
+ /*
+- * __activate_task - move a task to the runqueue.
++ * activate_task - move a task to the runqueue.
+  */
+-static void __activate_task(struct task_struct *p, struct rq *rq)
++static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
+ {
+-	struct prio_array *target = rq->active;
+-
+-	if (batch_task(p))
+-		target = rq->expired;
+-	enqueue_task(p, target);
++	enqueue_task(rq, p, wakeup);
+ 	inc_nr_running(p, rq);
+ }
+ 
+ /*
+- * __activate_idle_task - move idle task to the _front_ of runqueue.
++ * activate_idle_task - move idle task to the _front_ of runqueue.
+  */
+-static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
++static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
+ {
+-	enqueue_task_head(p, rq->active);
++	enqueue_task(rq, p, 0);
+ 	inc_nr_running(p, rq);
+ }
+ 
+ /*
+- * Recalculate p->normal_prio and p->prio after having slept,
+- * updating the sleep-average too:
+- */
+-static int recalc_task_prio(struct task_struct *p, unsigned long long now)
+-{
+-	/* Caller must always ensure 'now >= p->timestamp' */
+-	unsigned long sleep_time = now - p->timestamp;
+-
+-	if (batch_task(p))
+-		sleep_time = 0;
+-
+-	if (likely(sleep_time > 0)) {
+-		/*
+-		 * This ceiling is set to the lowest priority that would allow
+-		 * a task to be reinserted into the active array on timeslice
+-		 * completion.
+-		 */
+-		unsigned long ceiling = INTERACTIVE_SLEEP(p);
+-
+-		if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
+-			/*
+-			 * Prevents user tasks from achieving best priority
+-			 * with one single large enough sleep.
+-			 */
+-			p->sleep_avg = ceiling;
+-			/*
+-			 * Using INTERACTIVE_SLEEP() as a ceiling places a
+-			 * nice(0) task 1ms sleep away from promotion, and
+-			 * gives it 700ms to round-robin with no chance of
+-			 * being demoted.  This is more than generous, so
+-			 * mark this sleep as non-interactive to prevent the
+-			 * on-runqueue bonus logic from intervening should
+-			 * this task not receive cpu immediately.
+-			 */
+-			p->sleep_type = SLEEP_NONINTERACTIVE;
+-		} else {
+-			/*
+-			 * Tasks waking from uninterruptible sleep are
+-			 * limited in their sleep_avg rise as they
+-			 * are likely to be waiting on I/O
+-			 */
+-			if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
+-				if (p->sleep_avg >= ceiling)
+-					sleep_time = 0;
+-				else if (p->sleep_avg + sleep_time >=
+-					 ceiling) {
+-						p->sleep_avg = ceiling;
+-						sleep_time = 0;
+-				}
+-			}
+-
+-			/*
+-			 * This code gives a bonus to interactive tasks.
+-			 *
+-			 * The boost works by updating the 'average sleep time'
+-			 * value here, based on ->timestamp. The more time a
+-			 * task spends sleeping, the higher the average gets -
+-			 * and the higher the priority boost gets as well.
+-			 */
+-			p->sleep_avg += sleep_time;
+-
+-		}
+-		if (p->sleep_avg > NS_MAX_SLEEP_AVG)
+-			p->sleep_avg = NS_MAX_SLEEP_AVG;
+-	}
+-
+-	return effective_prio(p);
+-}
+-
+-/*
+- * activate_task - mov