summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarcin Juszkiewicz <marcin@juszkiewicz.com.pl>2009-10-21 12:22:20 +0200
committerMarcin Juszkiewicz <marcin@juszkiewicz.com.pl>2009-12-14 11:34:58 +0100
commit9d1b79b7848e13e1bf80b736671f76144cc508d4 (patch)
treec5e1ff0f4b3c2f08444666fdf8348aa8610fc98d
parent575cf43aa9df4192aa9125258545e7943a45f4d5 (diff)
linux 2.6.23: keep sched-cfs locally updated to 2.6.23.17
-rw-r--r--recipes/linux/linux-2.6.23/sched-cfs-v2.6.23.12-v24.1.patch8567
-rw-r--r--recipes/linux/linux_2.6.23.bb4
2 files changed, 8569 insertions, 2 deletions
diff --git a/recipes/linux/linux-2.6.23/sched-cfs-v2.6.23.12-v24.1.patch b/recipes/linux/linux-2.6.23/sched-cfs-v2.6.23.12-v24.1.patch
new file mode 100644
index 0000000000..77ee5c8f1d
--- /dev/null
+++ b/recipes/linux/linux-2.6.23/sched-cfs-v2.6.23.12-v24.1.patch
@@ -0,0 +1,8567 @@
+---
+ Documentation/sched-design-CFS.txt | 67 +
+ Makefile | 2
+ arch/i386/Kconfig | 11
+ drivers/kvm/kvm.h | 10
+ fs/pipe.c | 9
+ fs/proc/array.c | 21
+ fs/proc/base.c | 2
+ fs/proc/proc_misc.c | 15
+ include/linux/cgroup.h | 12
+ include/linux/cpuset.h | 5
+ include/linux/kernel.h | 7
+ include/linux/kernel_stat.h | 3
+ include/linux/nodemask.h | 94 +
+ include/linux/sched.h | 174 ++
+ include/linux/taskstats.h | 7
+ include/linux/topology.h | 5
+ init/Kconfig | 26
+ init/main.c | 3
+ kernel/delayacct.c | 8
+ kernel/exit.c | 6
+ kernel/fork.c | 5
+ kernel/ksysfs.c | 8
+ kernel/sched.c | 2310 +++++++++++++++++++++++--------------
+ kernel/sched_debug.c | 289 +++-
+ kernel/sched_fair.c | 885 ++++++--------
+ kernel/sched_idletask.c | 26
+ kernel/sched_rt.c | 54
+ kernel/sched_stats.h | 40
+ kernel/sysctl.c | 40
+ kernel/timer.c | 7
+ kernel/tsacct.c | 4
+ kernel/user.c | 249 +++
+ mm/memory_hotplug.c | 7
+ mm/page_alloc.c | 50
+ mm/vmscan.c | 4
+ net/unix/af_unix.c | 4
+ 36 files changed, 2883 insertions(+), 1586 deletions(-)
+
+--- linux-2.6.23.orig/Documentation/sched-design-CFS.txt
++++ linux-2.6.23/Documentation/sched-design-CFS.txt
+@@ -115,5 +115,72 @@ Some implementation details:
+ - reworked/sanitized SMP load-balancing: the runqueue-walking
+ assumptions are gone from the load-balancing code now, and
+ iterators of the scheduling modules are used. The balancing code got
+ quite a bit simpler as a result.
+
++
++Group scheduler extension to CFS
++================================
++
++Normally the scheduler operates on individual tasks and strives to provide
++fair CPU time to each task. Sometimes, it may be desirable to group tasks
++and provide fair CPU time to each such task group. For example, it may
++be desirable to first provide fair CPU time to each user on the system
++and then to each task belonging to a user.
++
++CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets
++SCHED_NORMAL/BATCH tasks be be grouped and divides CPU time fairly among such
++groups. At present, there are two (mutually exclusive) mechanisms to group
++tasks for CPU bandwidth control purpose:
++
++ - Based on user id (CONFIG_FAIR_USER_SCHED)
++ In this option, tasks are grouped according to their user id.
++ - Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED)
++ This options lets the administrator create arbitrary groups
++ of tasks, using the "cgroup" pseudo filesystem. See
++ Documentation/cgroups.txt for more information about this
++ filesystem.
++
++Only one of these options to group tasks can be chosen and not both.
++
++Group scheduler tunables:
++
++When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for
++each new user and a "cpu_share" file is added in that directory.
++
++ # cd /sys/kernel/uids
++ # cat 512/cpu_share # Display user 512's CPU share
++ 1024
++ # echo 2048 > 512/cpu_share # Modify user 512's CPU share
++ # cat 512/cpu_share # Display user 512's CPU share
++ 2048
++ #
++
++CPU bandwidth between two users are divided in the ratio of their CPU shares.
++For ex: if you would like user "root" to get twice the bandwidth of user
++"guest", then set the cpu_share for both the users such that "root"'s
++cpu_share is twice "guest"'s cpu_share
++
++
++When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created
++for each group created using the pseudo filesystem. See example steps
++below to create task groups and modify their CPU share using the "cgroups"
++pseudo filesystem
++
++ # mkdir /dev/cpuctl
++ # mount -t cgroup -ocpu none /dev/cpuctl
++ # cd /dev/cpuctl
++
++ # mkdir multimedia # create "multimedia" group of tasks
++ # mkdir browser # create "browser" group of tasks
++
++ # #Configure the multimedia group to receive twice the CPU bandwidth
++ # #that of browser group
++
++ # echo 2048 > multimedia/cpu.shares
++ # echo 1024 > browser/cpu.shares
++
++ # firefox & # Launch firefox and move it to "browser" group
++ # echo <firefox_pid> > browser/tasks
++
++ # #Launch gmplayer (or your favourite movie player)
++ # echo <movie_player_pid> > multimedia/tasks
+--- linux-2.6.23.orig/Makefile
++++ linux-2.6.23/Makefile
+@@ -1,9 +1,9 @@
+ VERSION = 2
+ PATCHLEVEL = 6
+ SUBLEVEL = 23
+-EXTRAVERSION = .17
++EXTRAVERSION = .17-cfs-v24.1
+ NAME = Arr Matey! A Hairy Bilge Rat!
+
+ # *DOCUMENTATION*
+ # To see a list of typical targets execute "make help"
+ # More info can be located in ./README
+--- linux-2.6.23.orig/arch/i386/Kconfig
++++ linux-2.6.23/arch/i386/Kconfig
+@@ -212,10 +212,21 @@ config X86_ES7000
+ Only choose this option if you have such a system, otherwise you
+ should say N here.
+
+ endchoice
+
++config SCHED_NO_NO_OMIT_FRAME_POINTER
++ bool "Single-depth WCHAN output"
++ default y
++ help
++ Calculate simpler /proc/<PID>/wchan values. If this option
++ is disabled then wchan values will recurse back to the
++ caller function. This provides more accurate wchan values,
++ at the expense of slightly more scheduling overhead.
++
++ If in doubt, say "Y".
++
+ config PARAVIRT
+ bool "Paravirtualization support (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ depends on !(X86_VISWS || X86_VOYAGER)
+ help
+--- linux-2.6.23.orig/drivers/kvm/kvm.h
++++ linux-2.6.23/drivers/kvm/kvm.h
+@@ -623,10 +623,20 @@ void __kvm_mmu_free_some_pages(struct kv
+ int kvm_mmu_load(struct kvm_vcpu *vcpu);
+ void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+
+ int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
+
++static inline void kvm_guest_enter(void)
++{
++ current->flags |= PF_VCPU;
++}
++
++static inline void kvm_guest_exit(void)
++{
++ current->flags &= ~PF_VCPU;
++}
++
+ static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
+ u32 error_code)
+ {
+ return vcpu->mmu.page_fault(vcpu, gva, error_code);
+ }
+--- linux-2.6.23.orig/fs/pipe.c
++++ linux-2.6.23/fs/pipe.c
+@@ -43,12 +43,11 @@ void pipe_wait(struct pipe_inode_info *p
+
+ /*
+ * Pipes are system-local resources, so sleeping on them
+ * is considered a noninteractive wait:
+ */
+- prepare_to_wait(&pipe->wait, &wait,
+- TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE);
++ prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
+ if (pipe->inode)
+ mutex_unlock(&pipe->inode->i_mutex);
+ schedule();
+ finish_wait(&pipe->wait, &wait);
+ if (pipe->inode)
+@@ -381,11 +380,11 @@ redo:
+ }
+ mutex_unlock(&inode->i_mutex);
+
+ /* Signal writers asynchronously that there is more room. */
+ if (do_wakeup) {
+- wake_up_interruptible(&pipe->wait);
++ wake_up_interruptible_sync(&pipe->wait);
+ kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
+ }
+ if (ret > 0)
+ file_accessed(filp);
+ return ret;
+@@ -554,11 +553,11 @@ redo2:
+ pipe->waiting_writers--;
+ }
+ out:
+ mutex_unlock(&inode->i_mutex);
+ if (do_wakeup) {
+- wake_up_interruptible(&pipe->wait);
++ wake_up_interruptible_sync(&pipe->wait);
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+ }
+ if (ret > 0)
+ file_update_time(filp);
+ return ret;
+@@ -648,11 +647,11 @@ pipe_release(struct inode *inode, int de
+ pipe->writers -= decw;
+
+ if (!pipe->readers && !pipe->writers) {
+ free_pipe_info(inode);
+ } else {
+- wake_up_interruptible(&pipe->wait);
++ wake_up_interruptible_sync(&pipe->wait);
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+ kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
+ }
+ mutex_unlock(&inode->i_mutex);
+
+--- linux-2.6.23.orig/fs/proc/array.c
++++ linux-2.6.23/fs/proc/array.c
+@@ -365,15 +365,22 @@ static cputime_t task_stime(struct task_
+ * grows monotonically - apps rely on that):
+ */
+ stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
+ cputime_to_clock_t(task_utime(p));
+
+- p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
++ if (stime >= 0)
++ p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
++
+ return p->prev_stime;
+ }
+ #endif
+
++static cputime_t task_gtime(struct task_struct *p)
++{
++ return p->gtime;
++}
++
+ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
+ {
+ unsigned long vsize, eip, esp, wchan = ~0UL;
+ long priority, nice;
+ int tty_pgrp = -1, tty_nr = 0;
+@@ -385,10 +392,11 @@ static int do_task_stat(struct task_stru
+ struct mm_struct *mm;
+ unsigned long long start_time;
+ unsigned long cmin_flt = 0, cmaj_flt = 0;
+ unsigned long min_flt = 0, maj_flt = 0;
+ cputime_t cutime, cstime, utime, stime;
++ cputime_t cgtime, gtime;
+ unsigned long rsslim = 0;
+ char tcomm[sizeof(task->comm)];
+ unsigned long flags;
+
+ state = *get_task_state(task);
+@@ -403,10 +411,11 @@ static int do_task_stat(struct task_stru
+ get_task_comm(tcomm, task);
+
+ sigemptyset(&sigign);
+ sigemptyset(&sigcatch);
+ cutime = cstime = utime = stime = cputime_zero;
++ cgtime = gtime = cputime_zero;
+
+ rcu_read_lock();
+ if (lock_task_sighand(task, &flags)) {
+ struct signal_struct *sig = task->signal;
+
+@@ -420,27 +429,30 @@ static int do_task_stat(struct task_stru
+
+ cmin_flt = sig->cmin_flt;
+ cmaj_flt = sig->cmaj_flt;
+ cutime = sig->cutime;
+ cstime = sig->cstime;
++ cgtime = sig->cgtime;
+ rsslim = sig->rlim[RLIMIT_RSS].rlim_cur;
+
+ /* add up live thread stats at the group level */
+ if (whole) {
+ struct task_struct *t = task;
+ do {
+ min_flt += t->min_flt;
+ maj_flt += t->maj_flt;
+ utime = cputime_add(utime, task_utime(t));
+ stime = cputime_add(stime, task_stime(t));
++ gtime = cputime_add(gtime, task_gtime(t));
+ t = next_thread(t);
+ } while (t != task);
+
+ min_flt += sig->min_flt;
+ maj_flt += sig->maj_flt;
+ utime = cputime_add(utime, sig->utime);
+ stime = cputime_add(stime, sig->stime);
++ gtime = cputime_add(gtime, sig->gtime);
+ }
+
+ sid = signal_session(sig);
+ pgid = process_group(task);
+ ppid = rcu_dereference(task->real_parent)->tgid;
+@@ -454,10 +466,11 @@ static int do_task_stat(struct task_stru
+ if (!whole) {
+ min_flt = task->min_flt;
+ maj_flt = task->maj_flt;
+ utime = task_utime(task);
+ stime = task_stime(task);
++ gtime = task_gtime(task);
+ }
+
+ /* scale priority and nice values from timeslices to -20..20 */
+ /* to make it look like a "normal" Unix priority/nice value */
+ priority = task_prio(task);
+@@ -471,11 +484,11 @@ static int do_task_stat(struct task_stru
+ /* convert nsec -> ticks */
+ start_time = nsec_to_clock_t(start_time);
+
+ res = sprintf(buffer, "%d (%s) %c %d %d %d %d %d %u %lu \
+ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
+-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu\n",
++%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n",
+ task->pid,
+ tcomm,
+ state,
+ ppid,
+ pgid,
+@@ -516,11 +529,13 @@ static int do_task_stat(struct task_stru
+ 0UL,
+ task->exit_signal,
+ task_cpu(task),
+ task->rt_priority,
+ task->policy,
+- (unsigned long long)delayacct_blkio_ticks(task));
++ (unsigned long long)delayacct_blkio_ticks(task),
++ cputime_to_clock_t(gtime),
++ cputime_to_clock_t(cgtime));
+ if (mm)
+ mmput(mm);
+ return res;
+ }
+
+--- linux-2.6.23.orig/fs/proc/base.c
++++ linux-2.6.23/fs/proc/base.c
+@@ -302,11 +302,11 @@ static int proc_pid_wchan(struct task_st
+ static int proc_pid_schedstat(struct task_struct *task, char *buffer)
+ {
+ return sprintf(buffer, "%llu %llu %lu\n",
+ task->sched_info.cpu_time,
+ task->sched_info.run_delay,
+- task->sched_info.pcnt);
++ task->sched_info.pcount);
+ }
+ #endif
+
+ /* The badness from the OOM killer */
+ unsigned long badness(struct task_struct *p, unsigned long uptime);
+--- linux-2.6.23.orig/fs/proc/proc_misc.c
++++ linux-2.6.23/fs/proc/proc_misc.c
+@@ -441,20 +441,22 @@ static const struct file_operations proc
+ static int show_stat(struct seq_file *p, void *v)
+ {
+ int i;
+ unsigned long jif;
+ cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
++ cputime64_t guest;
+ u64 sum = 0;
+ struct timespec boottime;
+ unsigned int *per_irq_sum;
+
+ per_irq_sum = kzalloc(sizeof(unsigned int)*NR_IRQS, GFP_KERNEL);
+ if (!per_irq_sum)
+ return -ENOMEM;
+
+ user = nice = system = idle = iowait =
+ irq = softirq = steal = cputime64_zero;
++ guest = cputime64_zero;
+ getboottime(&boottime);
+ jif = boottime.tv_sec;
+
+ for_each_possible_cpu(i) {
+ int j;
+@@ -465,26 +467,28 @@ static int show_stat(struct seq_file *p,
+ idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle);
+ iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait);
+ irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
+ softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
+ steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
++ guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
+ for (j = 0; j < NR_IRQS; j++) {
+ unsigned int temp = kstat_cpu(i).irqs[j];
+ sum += temp;
+ per_irq_sum[j] += temp;
+ }
+ }
+
+- seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu\n",
++ seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+ (unsigned long long)cputime64_to_clock_t(user),
+ (unsigned long long)cputime64_to_clock_t(nice),
+ (unsigned long long)cputime64_to_clock_t(system),
+ (unsigned long long)cputime64_to_clock_t(idle),
+ (unsigned long long)cputime64_to_clock_t(iowait),
+ (unsigned long long)cputime64_to_clock_t(irq),
+ (unsigned long long)cputime64_to_clock_t(softirq),
+- (unsigned long long)cputime64_to_clock_t(steal));
++ (unsigned long long)cputime64_to_clock_t(steal),
++ (unsigned long long)cputime64_to_clock_t(guest));
+ for_each_online_cpu(i) {
+
+ /* Copy values here to work around gcc-2.95.3, gcc-2.96 */
+ user = kstat_cpu(i).cpustat.user;
+ nice = kstat_cpu(i).cpustat.nice;
+@@ -492,20 +496,23 @@ static int show_stat(struct seq_file *p,
+ idle = kstat_cpu(i).cpustat.idle;
+ iowait = kstat_cpu(i).cpustat.iowait;
+ irq = kstat_cpu(i).cpustat.irq;
+ softirq = kstat_cpu(i).cpustat.softirq;
+ steal = kstat_cpu(i).cpustat.steal;
+- seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu\n",
++ guest = kstat_cpu(i).cpustat.guest;
++ seq_printf(p,
++ "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+ i,
+ (unsigned long long)cputime64_to_clock_t(user),
+ (unsigned long long)cputime64_to_clock_t(nice),
+ (unsigned long long)cputime64_to_clock_t(system),
+ (unsigned long long)cputime64_to_clock_t(idle),
+ (unsigned long long)cputime64_to_clock_t(iowait),
+ (unsigned long long)cputime64_to_clock_t(irq),
+ (unsigned long long)cputime64_to_clock_t(softirq),
+- (unsigned long long)cputime64_to_clock_t(steal));
++ (unsigned long long)cputime64_to_clock_t(steal),
++ (unsigned long long)cputime64_to_clock_t(guest));
+ }
+ seq_printf(p, "intr %llu", (unsigned long long)sum);
+
+ #ifndef CONFIG_SMP
+ /* Touches too many cache lines on SMP setups */
+--- /dev/null
++++ linux-2.6.23/include/linux/cgroup.h
+@@ -0,0 +1,12 @@
++#ifndef _LINUX_CGROUP_H
++#define _LINUX_CGROUP_H
++
++/*
++ * Control groups are not backported - we use a few compatibility
++ * defines to be able to use the upstream sched.c as-is:
++ */
++#define task_pid_nr(task) (task)->pid
++#define task_pid_vnr(task) (task)->pid
++#define find_task_by_vpid(pid) find_task_by_pid(pid)
++
++#endif
+--- linux-2.6.23.orig/include/linux/cpuset.h
++++ linux-2.6.23/include/linux/cpuset.h
+@@ -144,8 +144,13 @@ static inline int cpuset_do_slab_mem_spr
+ return 0;
+ }
+
+ static inline void cpuset_track_online_nodes(void) {}
+
++static inline cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p)
++{
++ return cpu_possible_map;
++}
++
+ #endif /* !CONFIG_CPUSETS */
+
+ #endif /* _LINUX_CPUSET_H */
+--- linux-2.6.23.orig/include/linux/kernel.h
++++ linux-2.6.23/include/linux/kernel.h
+@@ -59,10 +59,17 @@ extern const char linux_proc_banner[];
+ #define KERN_WARNING "<4>" /* warning conditions */
+ #define KERN_NOTICE "<5>" /* normal but significant condition */
+ #define KERN_INFO "<6>" /* informational */
+ #define KERN_DEBUG "<7>" /* debug-level messages */
+
++/*
++ * Annotation for a "continued" line of log printout (only done after a
++ * line that had no enclosing \n). Only to be used by core/arch code
++ * during early bootup (a continued line is not SMP-safe otherwise).
++ */
++#define KERN_CONT ""
++
+ extern int console_printk[];
+
+ #define console_loglevel (console_printk[0])
+ #define default_message_loglevel (console_printk[1])
+ #define minimum_console_loglevel (console_printk[2])
+--- linux-2.6.23.orig/include/linux/kernel_stat.h
++++ linux-2.6.23/include/linux/kernel_stat.h
+@@ -21,10 +21,11 @@ struct cpu_usage_stat {
+ cputime64_t softirq;
+ cputime64_t irq;
+ cputime64_t idle;
+ cputime64_t iowait;
+ cputime64_t steal;
++ cputime64_t guest;
+ };
+
+ struct kernel_stat {
+ struct cpu_usage_stat cpustat;
+ unsigned int irqs[NR_IRQS];
+@@ -50,9 +51,11 @@ static inline int kstat_irqs(int irq)
+
+ return sum;
+ }
+
+ extern void account_user_time(struct task_struct *, cputime_t);
++extern void account_user_time_scaled(struct task_struct *, cputime_t);
+ extern void account_system_time(struct task_struct *, int, cputime_t);
++extern void account_system_time_scaled(struct task_struct *, cputime_t);
+ extern void account_steal_time(struct task_struct *, cputime_t);
+
+ #endif /* _LINUX_KERNEL_STAT_H */
+--- linux-2.6.23.orig/include/linux/nodemask.h
++++ linux-2.6.23/include/linux/nodemask.h
+@@ -336,46 +336,108 @@ static inline void __nodes_remap(nodemas
+ if (!nodes_empty(mask)) \
+ for ((node) = 0; (node) < 1; (node)++)
+ #endif /* MAX_NUMNODES */
+
+ /*
++ * Bitmasks that are kept for all the nodes.
++ */
++enum node_states {
++ N_POSSIBLE, /* The node could become online at some point */
++ N_ONLINE, /* The node is online */
++ N_NORMAL_MEMORY, /* The node has regular memory */
++#ifdef CONFIG_HIGHMEM
++ N_HIGH_MEMORY, /* The node has regular or high memory */
++#else
++ N_HIGH_MEMORY = N_NORMAL_MEMORY,
++#endif
++ N_CPU, /* The node has one or more cpus */
++ NR_NODE_STATES
++};
++
++/*
+ * The following particular system nodemasks and operations
+ * on them manage all possible and online nodes.
+ */
+
+-extern nodemask_t node_online_map;
+-extern nodemask_t node_possible_map;
++extern nodemask_t node_states[NR_NODE_STATES];
+
+ #if MAX_NUMNODES > 1
+-#define num_online_nodes() nodes_weight(node_online_map)
+-#define num_possible_nodes() nodes_weight(node_possible_map)
+-#define node_online(node) node_isset((node), node_online_map)
+-#define node_possible(node) node_isset((node), node_possible_map)
+-#define first_online_node first_node(node_online_map)
+-#define next_online_node(nid) next_node((nid), node_online_map)
++static inline int node_state(int node, enum node_states state)
++{
++ return node_isset(node, node_states[state]);
++}
++
++static inline void node_set_state(int node, enum node_states state)
++{
++ __node_set(node, &node_states[state]);
++}
++
++static inline void node_clear_state(int node, enum node_states state)
++{
++ __node_clear(node, &node_states[state]);
++}
++
++static inline int num_node_state(enum node_states state)
++{
++ return nodes_weight(node_states[state]);
++}
++
++#define for_each_node_state(__node, __state) \
++ for_each_node_mask((__node), node_states[__state])
++
++#define first_online_node first_node(node_states[N_ONLINE])
++#define next_online_node(nid) next_node((nid), node_states[N_ONLINE])
++
+ extern int nr_node_ids;
+ #else
+-#define num_online_nodes() 1
+-#define num_possible_nodes() 1
+-#define node_online(node) ((node) == 0)
+-#define node_possible(node) ((node) == 0)
++
++static inline int node_state(int node, enum node_states state)
++{
++ return node == 0;
++}
++
++static inline void node_set_state(int node, enum node_states state)
++{
++}
++
++static inline void node_clear_state(int node, enum node_states state)
++{
++}
++
++static inline int num_node_state(enum node_states state)
++{
++ return 1;
++}
++
++#define for_each_node_state(node, __state) \
++ for ( (node) = 0; (node) == 0; (node) = 1)
++
+ #define first_online_node 0
+ #define next_online_node(nid) (MAX_NUMNODES)
+ #define nr_node_ids 1
++
+ #endif
+
++#define node_online_map node_states[N_ONLINE]
++#define node_possible_map node_states[N_POSSIBLE]
++
+ #define any_online_node(mask) \
+ ({ \
+ int node; \
+ for_each_node_mask(node, (mask)) \
+ if (node_online(node)) \
+ break; \
+ node; \
+ })
+
+-#define node_set_online(node) set_bit((node), node_online_map.bits)
+-#define node_set_offline(node) clear_bit((node), node_online_map.bits)
++#define num_online_nodes() num_node_state(N_ONLINE)
++#define num_possible_nodes() num_node_state(N_POSSIBLE)
++#define node_online(node) node_state((node), N_ONLINE)
++#define node_possible(node) node_state((node), N_POSSIBLE)
++
++#define node_set_online(node) node_set_state((node), N_ONLINE)
++#define node_set_offline(node) node_clear_state((node), N_ONLINE)
+
+-#define for_each_node(node) for_each_node_mask((node), node_possible_map)
+-#define for_each_online_node(node) for_each_node_mask((node), node_online_map)
++#define for_each_node(node) for_each_node_state(node, N_POSSIBLE)
++#define for_each_online_node(node) for_each_node_state(node, N_ONLINE)
+
+ #endif /* __LINUX_NODEMASK_H */
+--- linux-2.6.23.orig/include/linux/sched.h
++++ linux-2.6.23/include/linux/sched.h
+@@ -1,10 +1,21 @@
+ #ifndef _LINUX_SCHED_H
+ #define _LINUX_SCHED_H
+
+ #include <linux/auxvec.h> /* For AT_VECTOR_SIZE */
+
++/* backporting helper macro: */
++#define cpu_sibling_map(cpu) cpu_sibling_map[cpu]
++
++/*
++ * * Control groups are not backported - we use a few compatibility
++ * * defines to be able to use the upstream sched.c as-is:
++ * */
++#define task_pid_nr(task) (task)->pid
++#define task_pid_vnr(task) (task)->pid
++#define find_task_by_vpid(pid) find_task_by_pid(pid)
++
+ /*
+ * cloning flags:
+ */
+ #define CSIGNAL 0x000000ff /* signal mask to be sent at exit */
+ #define CLONE_VM 0x00000100 /* set if VM shared between processes */
+@@ -84,10 +95,11 @@ struct sched_param {
+ #include <linux/param.h>
+ #include <linux/resource.h>
+ #include <linux/timer.h>
+ #include <linux/hrtimer.h>
+ #include <linux/task_io_accounting.h>
++#include <linux/kobject.h>
+
+ #include <asm/processor.h>
+
+ struct exec_domain;
+ struct futex_pi_state;
+@@ -133,10 +145,11 @@ extern unsigned long nr_active(void);
+ extern unsigned long nr_iowait(void);
+ extern unsigned long weighted_cpuload(const int cpu);
+
+ struct seq_file;
+ struct cfs_rq;
++struct task_group;
+ #ifdef CONFIG_SCHED_DEBUG
+ extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
+ extern void proc_sched_set_task(struct task_struct *p);
+ extern void
+ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+@@ -171,12 +184,11 @@ print_cfs_rq(struct seq_file *m, int cpu
+ #define TASK_TRACED 8
+ /* in tsk->exit_state */
+ #define EXIT_ZOMBIE 16
+ #define EXIT_DEAD 32
+ /* in tsk->state again */
+-#define TASK_NONINTERACTIVE 64
+-#define TASK_DEAD 128
++#define TASK_DEAD 64
+
+ #define __set_task_state(tsk, state_value) \
+ do { (tsk)->state = (state_value); } while (0)
+ #define set_task_state(tsk, state_value) \
+ set_mb((tsk)->state, (state_value))
+@@ -276,10 +288,14 @@ static inline void touch_all_softlockup_
+ #endif
+
+
+ /* Attach to any functions which should be ignored in wchan output. */
+ #define __sched __attribute__((__section__(".sched.text")))
++
++/* Linker adds these: start and end of __sched functions */
++extern char __sched_text_start[], __sched_text_end[];
++
+ /* Is this address in the __sched functions? */
+ extern int in_sched_functions(unsigned long addr);
+
+ #define MAX_SCHEDULE_TIMEOUT LONG_MAX
+ extern signed long FASTCALL(schedule_timeout(signed long timeout));
+@@ -513,10 +529,12 @@ struct signal_struct {
+ * and for reaped dead child processes forked by this group.
+ * Live threads maintain their own counters and add to these
+ * in __exit_signal, except for the group leader.
+ */
+ cputime_t utime, stime, cutime, cstime;
++ cputime_t gtime;
++ cputime_t cgtime;
+ unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
+ unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
+ unsigned long inblock, oublock, cinblock, coublock;
+
+ /*
+@@ -593,12 +611,27 @@ struct user_struct {
+ #endif
+
+ /* Hash table maintenance information */
+ struct hlist_node uidhash_node;
+ uid_t uid;
++
++#ifdef CONFIG_FAIR_USER_SCHED
++ struct task_group *tg;
++#ifdef CONFIG_SYSFS
++ struct kset kset;
++ struct subsys_attribute user_attr;
++ struct work_struct work;
++#endif
++#endif
+ };
+
++#ifdef CONFIG_FAIR_USER_SCHED
++extern int uids_kobject_init(void);
++#else
++static inline int uids_kobject_init(void) { return 0; }
++#endif
++
+ extern struct user_struct *find_user(uid_t);
+
+ extern struct user_struct root_user;
+ #define INIT_USER (&root_user)
+
+@@ -606,17 +639,21 @@ struct backing_dev_info;
+ struct reclaim_state;
+
+ #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+ struct sched_info {
+ /* cumulative counters */
+- unsigned long pcnt; /* # of times run on this cpu */
++ unsigned long pcount; /* # of times run on this cpu */
+ unsigned long long cpu_time, /* time spent on the cpu */
+ run_delay; /* time spent waiting on a runqueue */
+
+ /* timestamps */
+ unsigned long long last_arrival,/* when we last ran on a cpu */
+ last_queued; /* when we were last queued to run */
++#ifdef CONFIG_SCHEDSTATS
++ /* BKL stats */
++ unsigned int bkl_count;
++#endif
+ };
+ #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
+
+ #ifdef CONFIG_SCHEDSTATS
+ extern const struct file_operations proc_schedstat_operations;
+@@ -747,43 +784,42 @@ struct sched_domain {
+ unsigned int balance_interval; /* initialise to 1. units in ms. */
+ unsigned int nr_balance_failed; /* initialise to 0 */
+
+ #ifdef CONFIG_SCHEDSTATS
+ /* load_balance() stats */
+- unsigned long lb_cnt[CPU_MAX_IDLE_TYPES];
+- unsigned long lb_failed[CPU_MAX_IDLE_TYPES];
+- unsigned long lb_balanced[CPU_MAX_IDLE_TYPES];
+- unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES];
+- unsigned long lb_gained[CPU_MAX_IDLE_TYPES];
+- unsigned long lb_hot_gained[CPU_MAX_IDLE_TYPES];
+- unsigned long lb_nobusyg[CPU_MAX_IDLE_TYPES];
+- unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES];
++ unsigned int lb_count[CPU_MAX_IDLE_TYPES];
++ unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
++ unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
++ unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
++ unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
++ unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
++ unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
++ unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];
+
+ /* Active load balancing */
+- unsigned long alb_cnt;
+- unsigned long alb_failed;
+- unsigned long alb_pushed;
++ unsigned int alb_count;
++ unsigned int alb_failed;
++ unsigned int alb_pushed;
+
+ /* SD_BALANCE_EXEC stats */
+- unsigned long sbe_cnt;
+- unsigned long sbe_balanced;
+- unsigned long sbe_pushed;
++ unsigned int sbe_count;
++ unsigned int sbe_balanced;
++ unsigned int sbe_pushed;
+
+ /* SD_BALANCE_FORK stats */
+- unsigned long sbf_cnt;
+- unsigned long sbf_balanced;
+- unsigned long sbf_pushed;
++ unsigned int sbf_count;
++ unsigned int sbf_balanced;
++ unsigned int sbf_pushed;
+
+ /* try_to_wake_up() stats */
+- unsigned long ttwu_wake_remote;
+- unsigned long ttwu_move_affine;
+- unsigned long ttwu_move_balance;
++ unsigned int ttwu_wake_remote;
++ unsigned int ttwu_move_affine;
++ unsigned int ttwu_move_balance;
+ #endif
+ };
+
+-extern int partition_sched_domains(cpumask_t *partition1,
+- cpumask_t *partition2);
++extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new);
+
+ #endif /* CONFIG_SMP */
+
+ /*
+ * A runqueue laden with a single nice 0 task scores a weighted_cpuload of
+@@ -851,27 +887,32 @@ struct uts_namespace;
+
+ struct rq;
+ struct sched_domain;
+
+ struct sched_class {
+- struct sched_class *next;
++ const struct sched_class *next;
+
+ void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
+ void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
+- void (*yield_task) (struct rq *rq, struct task_struct *p);
++ void (*yield_task) (struct rq *rq);
+
+ void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
+
+ struct task_struct * (*pick_next_task) (struct rq *rq);
+ void (*put_prev_task) (struct rq *rq, struct task_struct *p);
+
++#ifdef CONFIG_SMP
+ unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
+- struct rq *busiest,
+- unsigned long max_nr_move, unsigned long max_load_move,
++ struct rq *busiest, unsigned long max_load_move,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ int *all_pinned, int *this_best_prio);
+
++ int (*move_one_task) (struct rq *this_rq, int this_cpu,
++ struct rq *busiest, struct sched_domain *sd,
++ enum cpu_idle_type idle);
++#endif
++
+ void (*set_curr_task) (struct rq *rq);
+ void (*task_tick) (struct rq *rq, struct task_struct *p);
+ void (*task_new) (struct rq *rq, struct task_struct *p);
+ };
+
+@@ -885,46 +926,52 @@ struct load_weight {
+ * Current field usage histogram:
+ *
+ * 4 se->block_start
+ * 4 se->run_node
+ * 4 se->sleep_start
+- * 4 se->sleep_start_fair
+ * 6 se->load.weight
+- * 7 se->delta_fair
+- * 15 se->wait_runtime
+ */
+ struct sched_entity {
+- long wait_runtime;
+- unsigned long delta_fair_run;
+- unsigned long delta_fair_sleep;
+- unsigned long delta_exec;
+- s64 fair_key;
+ struct load_weight load; /* for load-balancing */
+ struct rb_node run_node;
+ unsigned int on_rq;
+
+ u64 exec_start;
+ u64 sum_exec_runtime;
++ u64 vruntime;
+ u64 prev_sum_exec_runtime;
+- u64 wait_start_fair;
+- u64 sleep_start_fair;
+
+ #ifdef CONFIG_SCHEDSTATS
+ u64 wait_start;
+ u64 wait_max;
+- s64 sum_wait_runtime;
+
+ u64 sleep_start;
+ u64 sleep_max;
+ s64 sum_sleep_runtime;
+
+ u64 block_start;
+ u64 block_max;
+ u64 exec_max;
++ u64 slice_max;
+
+- unsigned long wait_runtime_overruns;
+- unsigned long wait_runtime_underruns;
++ u64 nr_migrations;
++ u64 nr_migrations_cold;
++ u64 nr_failed_migrations_affine;
++ u64 nr_failed_migrations_running;
++ u64 nr_failed_migrations_hot;
++ u64 nr_forced_migrations;
++ u64 nr_forced2_migrations;
++
++ u64 nr_wakeups;
++ u64 nr_wakeups_sync;
++ u64 nr_wakeups_migrate;
++ u64 nr_wakeups_local;
++ u64 nr_wakeups_remote;
++ u64 nr_wakeups_affine;
++ u64 nr_wakeups_affine_attempts;
++ u64 nr_wakeups_passive;
++ u64 nr_wakeups_idle;
+ #endif
+
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+ struct sched_entity *parent;
+ /* rq on which this entity is (to be) queued: */
+@@ -949,11 +996,11 @@ struct task_struct {
+ #endif
+ #endif
+
+ int prio, static_prio, normal_prio;
+ struct list_head run_list;
+- struct sched_class *sched_class;
++ const struct sched_class *sched_class;
+ struct sched_entity se;
+
+ #ifdef CONFIG_PREEMPT_NOTIFIERS
+ /* list of struct preempt_notifier: */
+ struct hlist_head preempt_notifiers;
+@@ -1019,11 +1066,12 @@ struct task_struct {
+ struct completion *vfork_done; /* for vfork() */
+ int __user *set_child_tid; /* CLONE_CHILD_SETTID */
+ int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
+
+ unsigned int rt_priority;
+- cputime_t utime, stime;
++ cputime_t utime, stime, utimescaled, stimescaled;
++ cputime_t gtime;
+ cputime_t prev_utime, prev_stime;
+ unsigned long nvcsw, nivcsw; /* context switch counts */
+ struct timespec start_time; /* monotonic time */
+ struct timespec real_start_time; /* boot based time */
+ /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
+@@ -1312,10 +1360,11 @@ static inline void put_task_struct(struc
+ #define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */
+ /* Not implemented yet, only for 486*/
+ #define PF_STARTING 0x00000002 /* being created */
+ #define PF_EXITING 0x00000004 /* getting shut down */
+ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
++#define PF_VCPU 0x00000010 /* I'm a virtual CPU */
+ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
+ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
+ #define PF_DUMPCORE 0x00000200 /* dumped core */
+ #define PF_SIGNALED 0x00000400 /* killed by a signal */
+ #define PF_MEMALLOC 0x00000800 /* Allocating memory */
+@@ -1399,19 +1448,30 @@ extern void idle_task_exit(void);
+ static inline void idle_task_exit(void) {}
+ #endif
+
+ extern void sched_idle_next(void);
+
++#ifdef CONFIG_SCHED_DEBUG
+ extern unsigned int sysctl_sched_latency;
+ extern unsigned int sysctl_sched_min_granularity;
+ extern unsigned int sysctl_sched_wakeup_granularity;
+ extern unsigned int sysctl_sched_batch_wakeup_granularity;
+-extern unsigned int sysctl_sched_stat_granularity;
+-extern unsigned int sysctl_sched_runtime_limit;
+-extern unsigned int sysctl_sched_compat_yield;
+ extern unsigned int sysctl_sched_child_runs_first;
+ extern unsigned int sysctl_sched_features;
++extern unsigned int sysctl_sched_migration_cost;
++extern unsigned int sysctl_sched_nr_migrate;
++#ifdef CONFIG_FAIR_GROUP_SCHED
++extern unsigned int sysctl_sched_min_bal_int_shares;
++extern unsigned int sysctl_sched_max_bal_int_shares;
++#endif
++
++int sched_nr_latency_handler(struct ctl_table *table, int write,
++ struct file *file, void __user *buffer, size_t *length,
++ loff_t *ppos);
++#endif
++
++extern unsigned int sysctl_sched_compat_yield;
+
+ #ifdef CONFIG_RT_MUTEXES
+ extern int rt_mutex_getprio(struct task_struct *p);
+ extern void rt_mutex_setprio(struct task_struct *p, int prio);
+ extern void rt_mutex_adjust_pi(struct task_struct *p);
+@@ -1841,10 +1901,22 @@ extern long sched_getaffinity(pid_t pid,
+
+ extern int sched_mc_power_savings, sched_smt_power_savings;
+
+ extern void normalize_rt_tasks(void);
+
++#ifdef CONFIG_FAIR_GROUP_SCHED
++
++extern struct task_group init_task_group;