From 9d1b79b7848e13e1bf80b736671f76144cc508d4 Mon Sep 17 00:00:00 2001 From: Marcin Juszkiewicz Date: Wed, 21 Oct 2009 12:22:20 +0200 Subject: linux 2.6.23: keep sched-cfs locally updated to 2.6.23.17 --- .../linux-2.6.23/sched-cfs-v2.6.23.12-v24.1.patch | 8567 ++++++++++++++++++++ recipes/linux/linux_2.6.23.bb | 4 +- 2 files changed, 8569 insertions(+), 2 deletions(-) create mode 100644 recipes/linux/linux-2.6.23/sched-cfs-v2.6.23.12-v24.1.patch diff --git a/recipes/linux/linux-2.6.23/sched-cfs-v2.6.23.12-v24.1.patch b/recipes/linux/linux-2.6.23/sched-cfs-v2.6.23.12-v24.1.patch new file mode 100644 index 0000000000..77ee5c8f1d --- /dev/null +++ b/recipes/linux/linux-2.6.23/sched-cfs-v2.6.23.12-v24.1.patch @@ -0,0 +1,8567 @@ +--- + Documentation/sched-design-CFS.txt | 67 + + Makefile | 2 + arch/i386/Kconfig | 11 + drivers/kvm/kvm.h | 10 + fs/pipe.c | 9 + fs/proc/array.c | 21 + fs/proc/base.c | 2 + fs/proc/proc_misc.c | 15 + include/linux/cgroup.h | 12 + include/linux/cpuset.h | 5 + include/linux/kernel.h | 7 + include/linux/kernel_stat.h | 3 + include/linux/nodemask.h | 94 + + include/linux/sched.h | 174 ++ + include/linux/taskstats.h | 7 + include/linux/topology.h | 5 + init/Kconfig | 26 + init/main.c | 3 + kernel/delayacct.c | 8 + kernel/exit.c | 6 + kernel/fork.c | 5 + kernel/ksysfs.c | 8 + kernel/sched.c | 2310 +++++++++++++++++++++++-------------- + kernel/sched_debug.c | 289 +++- + kernel/sched_fair.c | 885 ++++++-------- + kernel/sched_idletask.c | 26 + kernel/sched_rt.c | 54 + kernel/sched_stats.h | 40 + kernel/sysctl.c | 40 + kernel/timer.c | 7 + kernel/tsacct.c | 4 + kernel/user.c | 249 +++ + mm/memory_hotplug.c | 7 + mm/page_alloc.c | 50 + mm/vmscan.c | 4 + net/unix/af_unix.c | 4 + 36 files changed, 2883 insertions(+), 1586 deletions(-) + +--- linux-2.6.23.orig/Documentation/sched-design-CFS.txt ++++ linux-2.6.23/Documentation/sched-design-CFS.txt +@@ -115,5 +115,72 @@ Some implementation details: + - reworked/sanitized SMP load-balancing: the 
runqueue-walking + assumptions are gone from the load-balancing code now, and + iterators of the scheduling modules are used. The balancing code got + quite a bit simpler as a result. + ++ ++Group scheduler extension to CFS ++================================ ++ ++Normally the scheduler operates on individual tasks and strives to provide ++fair CPU time to each task. Sometimes, it may be desirable to group tasks ++and provide fair CPU time to each such task group. For example, it may ++be desirable to first provide fair CPU time to each user on the system ++and then to each task belonging to a user. ++ ++CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets ++SCHED_NORMAL/BATCH tasks be grouped and divides CPU time fairly among such ++groups. At present, there are two (mutually exclusive) mechanisms to group ++tasks for CPU bandwidth control purposes: ++ ++ - Based on user id (CONFIG_FAIR_USER_SCHED) ++ In this option, tasks are grouped according to their user id. ++ - Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED) ++ This option lets the administrator create arbitrary groups ++ of tasks, using the "cgroup" pseudo filesystem. See ++ Documentation/cgroups.txt for more information about this ++ filesystem. ++ ++Only one of these options to group tasks can be chosen and not both. ++ ++Group scheduler tunables: ++ ++When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for ++each new user and a "cpu_share" file is added in that directory. ++ ++ # cd /sys/kernel/uids ++ # cat 512/cpu_share # Display user 512's CPU share ++ 1024 ++ # echo 2048 > 512/cpu_share # Modify user 512's CPU share ++ # cat 512/cpu_share # Display user 512's CPU share ++ 2048 ++ # ++ ++CPU bandwidth between two users is divided in the ratio of their CPU shares.
++For example: if you would like user "root" to get twice the bandwidth of user ++"guest", then set the cpu_share for both the users such that "root"'s ++cpu_share is twice "guest"'s cpu_share. ++ ++ ++When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created ++for each group created using the pseudo filesystem. See example steps ++below to create task groups and modify their CPU share using the "cgroups" ++pseudo filesystem: ++ ++ # mkdir /dev/cpuctl ++ # mount -t cgroup -ocpu none /dev/cpuctl ++ # cd /dev/cpuctl ++ ++ # mkdir multimedia # create "multimedia" group of tasks ++ # mkdir browser # create "browser" group of tasks ++ ++ # #Configure the multimedia group to receive twice the CPU bandwidth ++ # #that of browser group ++ ++ # echo 2048 > multimedia/cpu.shares ++ # echo 1024 > browser/cpu.shares ++ ++ # firefox & # Launch firefox and move it to "browser" group ++ # echo <firefox_pid> > browser/tasks ++ ++ # #Launch gmplayer (or your favourite movie player) ++ # echo <movie_player_pid> > multimedia/tasks +--- linux-2.6.23.orig/Makefile ++++ linux-2.6.23/Makefile +@@ -1,9 +1,9 @@ + VERSION = 2 + PATCHLEVEL = 6 + SUBLEVEL = 23 +-EXTRAVERSION = .17 ++EXTRAVERSION = .17-cfs-v24.1 + NAME = Arr Matey! A Hairy Bilge Rat! + + # *DOCUMENTATION* + # To see a list of typical targets execute "make help" + # More info can be located in ./README +--- linux-2.6.23.orig/arch/i386/Kconfig ++++ linux-2.6.23/arch/i386/Kconfig +@@ -212,10 +212,21 @@ config X86_ES7000 + Only choose this option if you have such a system, otherwise you + should say N here. + + endchoice + ++config SCHED_NO_NO_OMIT_FRAME_POINTER ++ bool "Single-depth WCHAN output" ++ default y ++ help ++ Calculate simpler /proc/<PID>/wchan values. If this option ++ is disabled then wchan values will recurse back to the ++ caller function. This provides more accurate wchan values, ++ at the expense of slightly more scheduling overhead. ++ ++ If in doubt, say "Y".
++ + config PARAVIRT + bool "Paravirtualization support (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on !(X86_VISWS || X86_VOYAGER) + help +--- linux-2.6.23.orig/drivers/kvm/kvm.h ++++ linux-2.6.23/drivers/kvm/kvm.h +@@ -623,10 +623,20 @@ void __kvm_mmu_free_some_pages(struct kv + int kvm_mmu_load(struct kvm_vcpu *vcpu); + void kvm_mmu_unload(struct kvm_vcpu *vcpu); + + int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); + ++static inline void kvm_guest_enter(void) ++{ ++ current->flags |= PF_VCPU; ++} ++ ++static inline void kvm_guest_exit(void) ++{ ++ current->flags &= ~PF_VCPU; ++} ++ + static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, + u32 error_code) + { + return vcpu->mmu.page_fault(vcpu, gva, error_code); + } +--- linux-2.6.23.orig/fs/pipe.c ++++ linux-2.6.23/fs/pipe.c +@@ -43,12 +43,11 @@ void pipe_wait(struct pipe_inode_info *p + + /* + * Pipes are system-local resources, so sleeping on them + * is considered a noninteractive wait: + */ +- prepare_to_wait(&pipe->wait, &wait, +- TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); ++ prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); + schedule(); + finish_wait(&pipe->wait, &wait); + if (pipe->inode) +@@ -381,11 +380,11 @@ redo: + } + mutex_unlock(&inode->i_mutex); + + /* Signal writers asynchronously that there is more room. 
*/ + if (do_wakeup) { +- wake_up_interruptible(&pipe->wait); ++ wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); + } + if (ret > 0) + file_accessed(filp); + return ret; +@@ -554,11 +553,11 @@ redo2: + pipe->waiting_writers--; + } + out: + mutex_unlock(&inode->i_mutex); + if (do_wakeup) { +- wake_up_interruptible(&pipe->wait); ++ wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + } + if (ret > 0) + file_update_time(filp); + return ret; +@@ -648,11 +647,11 @@ pipe_release(struct inode *inode, int de + pipe->writers -= decw; + + if (!pipe->readers && !pipe->writers) { + free_pipe_info(inode); + } else { +- wake_up_interruptible(&pipe->wait); ++ wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); + } + mutex_unlock(&inode->i_mutex); + +--- linux-2.6.23.orig/fs/proc/array.c ++++ linux-2.6.23/fs/proc/array.c +@@ -365,15 +365,22 @@ static cputime_t task_stime(struct task_ + * grows monotonically - apps rely on that): + */ + stime = nsec_to_clock_t(p->se.sum_exec_runtime) - + cputime_to_clock_t(task_utime(p)); + +- p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); ++ if (stime >= 0) ++ p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); ++ + return p->prev_stime; + } + #endif + ++static cputime_t task_gtime(struct task_struct *p) ++{ ++ return p->gtime; ++} ++ + static int do_task_stat(struct task_struct *task, char *buffer, int whole) + { + unsigned long vsize, eip, esp, wchan = ~0UL; + long priority, nice; + int tty_pgrp = -1, tty_nr = 0; +@@ -385,10 +392,11 @@ static int do_task_stat(struct task_stru + struct mm_struct *mm; + unsigned long long start_time; + unsigned long cmin_flt = 0, cmaj_flt = 0; + unsigned long min_flt = 0, maj_flt = 0; + cputime_t cutime, cstime, utime, stime; ++ cputime_t cgtime, gtime; + unsigned long rsslim = 0; + char 
tcomm[sizeof(task->comm)]; + unsigned long flags; + + state = *get_task_state(task); +@@ -403,10 +411,11 @@ static int do_task_stat(struct task_stru + get_task_comm(tcomm, task); + + sigemptyset(&sigign); + sigemptyset(&sigcatch); + cutime = cstime = utime = stime = cputime_zero; ++ cgtime = gtime = cputime_zero; + + rcu_read_lock(); + if (lock_task_sighand(task, &flags)) { + struct signal_struct *sig = task->signal; + +@@ -420,27 +429,30 @@ static int do_task_stat(struct task_stru + + cmin_flt = sig->cmin_flt; + cmaj_flt = sig->cmaj_flt; + cutime = sig->cutime; + cstime = sig->cstime; ++ cgtime = sig->cgtime; + rsslim = sig->rlim[RLIMIT_RSS].rlim_cur; + + /* add up live thread stats at the group level */ + if (whole) { + struct task_struct *t = task; + do { + min_flt += t->min_flt; + maj_flt += t->maj_flt; + utime = cputime_add(utime, task_utime(t)); + stime = cputime_add(stime, task_stime(t)); ++ gtime = cputime_add(gtime, task_gtime(t)); + t = next_thread(t); + } while (t != task); + + min_flt += sig->min_flt; + maj_flt += sig->maj_flt; + utime = cputime_add(utime, sig->utime); + stime = cputime_add(stime, sig->stime); ++ gtime = cputime_add(gtime, sig->gtime); + } + + sid = signal_session(sig); + pgid = process_group(task); + ppid = rcu_dereference(task->real_parent)->tgid; +@@ -454,10 +466,11 @@ static int do_task_stat(struct task_stru + if (!whole) { + min_flt = task->min_flt; + maj_flt = task->maj_flt; + utime = task_utime(task); + stime = task_stime(task); ++ gtime = task_gtime(task); + } + + /* scale priority and nice values from timeslices to -20..20 */ + /* to make it look like a "normal" Unix priority/nice value */ + priority = task_prio(task); +@@ -471,11 +484,11 @@ static int do_task_stat(struct task_stru + /* convert nsec -> ticks */ + start_time = nsec_to_clock_t(start_time); + + res = sprintf(buffer, "%d (%s) %c %d %d %d %d %d %u %lu \ + %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ +-%lu %lu %lu %lu %lu %lu %lu %lu %d 
%d %u %u %llu\n", ++%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", + task->pid, + tcomm, + state, + ppid, + pgid, +@@ -516,11 +529,13 @@ static int do_task_stat(struct task_stru + 0UL, + task->exit_signal, + task_cpu(task), + task->rt_priority, + task->policy, +- (unsigned long long)delayacct_blkio_ticks(task)); ++ (unsigned long long)delayacct_blkio_ticks(task), ++ cputime_to_clock_t(gtime), ++ cputime_to_clock_t(cgtime)); + if (mm) + mmput(mm); + return res; + } + +--- linux-2.6.23.orig/fs/proc/base.c ++++ linux-2.6.23/fs/proc/base.c +@@ -302,11 +302,11 @@ static int proc_pid_wchan(struct task_st + static int proc_pid_schedstat(struct task_struct *task, char *buffer) + { + return sprintf(buffer, "%llu %llu %lu\n", + task->sched_info.cpu_time, + task->sched_info.run_delay, +- task->sched_info.pcnt); ++ task->sched_info.pcount); + } + #endif + + /* The badness from the OOM killer */ + unsigned long badness(struct task_struct *p, unsigned long uptime); +--- linux-2.6.23.orig/fs/proc/proc_misc.c ++++ linux-2.6.23/fs/proc/proc_misc.c +@@ -441,20 +441,22 @@ static const struct file_operations proc + static int show_stat(struct seq_file *p, void *v) + { + int i; + unsigned long jif; + cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; ++ cputime64_t guest; + u64 sum = 0; + struct timespec boottime; + unsigned int *per_irq_sum; + + per_irq_sum = kzalloc(sizeof(unsigned int)*NR_IRQS, GFP_KERNEL); + if (!per_irq_sum) + return -ENOMEM; + + user = nice = system = idle = iowait = + irq = softirq = steal = cputime64_zero; ++ guest = cputime64_zero; + getboottime(&boottime); + jif = boottime.tv_sec; + + for_each_possible_cpu(i) { + int j; +@@ -465,26 +467,28 @@ static int show_stat(struct seq_file *p, + idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle); + iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait); + irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); + softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); + steal = 
cputime64_add(steal, kstat_cpu(i).cpustat.steal); ++ guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); + for (j = 0; j < NR_IRQS; j++) { + unsigned int temp = kstat_cpu(i).irqs[j]; + sum += temp; + per_irq_sum[j] += temp; + } + } + +- seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu\n", ++ seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", + (unsigned long long)cputime64_to_clock_t(user), + (unsigned long long)cputime64_to_clock_t(nice), + (unsigned long long)cputime64_to_clock_t(system), + (unsigned long long)cputime64_to_clock_t(idle), + (unsigned long long)cputime64_to_clock_t(iowait), + (unsigned long long)cputime64_to_clock_t(irq), + (unsigned long long)cputime64_to_clock_t(softirq), +- (unsigned long long)cputime64_to_clock_t(steal)); ++ (unsigned long long)cputime64_to_clock_t(steal), ++ (unsigned long long)cputime64_to_clock_t(guest)); + for_each_online_cpu(i) { + + /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ + user = kstat_cpu(i).cpustat.user; + nice = kstat_cpu(i).cpustat.nice; +@@ -492,20 +496,23 @@ static int show_stat(struct seq_file *p, + idle = kstat_cpu(i).cpustat.idle; + iowait = kstat_cpu(i).cpustat.iowait; + irq = kstat_cpu(i).cpustat.irq; + softirq = kstat_cpu(i).cpustat.softirq; + steal = kstat_cpu(i).cpustat.steal; +- seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu\n", ++ guest = kstat_cpu(i).cpustat.guest; ++ seq_printf(p, ++ "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", + i, + (unsigned long long)cputime64_to_clock_t(user), + (unsigned long long)cputime64_to_clock_t(nice), + (unsigned long long)cputime64_to_clock_t(system), + (unsigned long long)cputime64_to_clock_t(idle), + (unsigned long long)cputime64_to_clock_t(iowait), + (unsigned long long)cputime64_to_clock_t(irq), + (unsigned long long)cputime64_to_clock_t(softirq), +- (unsigned long long)cputime64_to_clock_t(steal)); ++ (unsigned long long)cputime64_to_clock_t(steal), ++ (unsigned long 
long)cputime64_to_clock_t(guest)); + } + seq_printf(p, "intr %llu", (unsigned long long)sum); + + #ifndef CONFIG_SMP + /* Touches too many cache lines on SMP setups */ +--- /dev/null ++++ linux-2.6.23/include/linux/cgroup.h +@@ -0,0 +1,12 @@ ++#ifndef _LINUX_CGROUP_H ++#define _LINUX_CGROUP_H ++ ++/* ++ * Control groups are not backported - we use a few compatibility ++ * defines to be able to use the upstream sched.c as-is: ++ */ ++#define task_pid_nr(task) (task)->pid ++#define task_pid_vnr(task) (task)->pid ++#define find_task_by_vpid(pid) find_task_by_pid(pid) ++ ++#endif +--- linux-2.6.23.orig/include/linux/cpuset.h ++++ linux-2.6.23/include/linux/cpuset.h +@@ -144,8 +144,13 @@ static inline int cpuset_do_slab_mem_spr + return 0; + } + + static inline void cpuset_track_online_nodes(void) {} + ++static inline cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p) ++{ ++ return cpu_possible_map; ++} ++ + #endif /* !CONFIG_CPUSETS */ + + #endif /* _LINUX_CPUSET_H */ +--- linux-2.6.23.orig/include/linux/kernel.h ++++ linux-2.6.23/include/linux/kernel.h +@@ -59,10 +59,17 @@ extern const char linux_proc_banner[]; + #define KERN_WARNING "<4>" /* warning conditions */ + #define KERN_NOTICE "<5>" /* normal but significant condition */ + #define KERN_INFO "<6>" /* informational */ + #define KERN_DEBUG "<7>" /* debug-level messages */ + ++/* ++ * Annotation for a "continued" line of log printout (only done after a ++ * line that had no enclosing \n). Only to be used by core/arch code ++ * during early bootup (a continued line is not SMP-safe otherwise). 
++ */ ++#define KERN_CONT "" ++ + extern int console_printk[]; + + #define console_loglevel (console_printk[0]) + #define default_message_loglevel (console_printk[1]) + #define minimum_console_loglevel (console_printk[2]) +--- linux-2.6.23.orig/include/linux/kernel_stat.h ++++ linux-2.6.23/include/linux/kernel_stat.h +@@ -21,10 +21,11 @@ struct cpu_usage_stat { + cputime64_t softirq; + cputime64_t irq; + cputime64_t idle; + cputime64_t iowait; + cputime64_t steal; ++ cputime64_t guest; + }; + + struct kernel_stat { + struct cpu_usage_stat cpustat; + unsigned int irqs[NR_IRQS]; +@@ -50,9 +51,11 @@ static inline int kstat_irqs(int irq) + + return sum; + } + + extern void account_user_time(struct task_struct *, cputime_t); ++extern void account_user_time_scaled(struct task_struct *, cputime_t); + extern void account_system_time(struct task_struct *, int, cputime_t); ++extern void account_system_time_scaled(struct task_struct *, cputime_t); + extern void account_steal_time(struct task_struct *, cputime_t); + + #endif /* _LINUX_KERNEL_STAT_H */ +--- linux-2.6.23.orig/include/linux/nodemask.h ++++ linux-2.6.23/include/linux/nodemask.h +@@ -336,46 +336,108 @@ static inline void __nodes_remap(nodemas + if (!nodes_empty(mask)) \ + for ((node) = 0; (node) < 1; (node)++) + #endif /* MAX_NUMNODES */ + + /* ++ * Bitmasks that are kept for all the nodes. ++ */ ++enum node_states { ++ N_POSSIBLE, /* The node could become online at some point */ ++ N_ONLINE, /* The node is online */ ++ N_NORMAL_MEMORY, /* The node has regular memory */ ++#ifdef CONFIG_HIGHMEM ++ N_HIGH_MEMORY, /* The node has regular or high memory */ ++#else ++ N_HIGH_MEMORY = N_NORMAL_MEMORY, ++#endif ++ N_CPU, /* The node has one or more cpus */ ++ NR_NODE_STATES ++}; ++ ++/* + * The following particular system nodemasks and operations + * on them manage all possible and online nodes. 
+ */ + +-extern nodemask_t node_online_map; +-extern nodemask_t node_possible_map; ++extern nodemask_t node_states[NR_NODE_STATES]; + + #if MAX_NUMNODES > 1 +-#define num_online_nodes() nodes_weight(node_online_map) +-#define num_possible_nodes() nodes_weight(node_possible_map) +-#define node_online(node) node_isset((node), node_online_map) +-#define node_possible(node) node_isset((node), node_possible_map) +-#define first_online_node first_node(node_online_map) +-#define next_online_node(nid) next_node((nid), node_online_map) ++static inline int node_state(int node, enum node_states state) ++{ ++ return node_isset(node, node_states[state]); ++} ++ ++static inline void node_set_state(int node, enum node_states state) ++{ ++ __node_set(node, &node_states[state]); ++} ++ ++static inline void node_clear_state(int node, enum node_states state) ++{ ++ __node_clear(node, &node_states[state]); ++} ++ ++static inline int num_node_state(enum node_states state) ++{ ++ return nodes_weight(node_states[state]); ++} ++ ++#define for_each_node_state(__node, __state) \ ++ for_each_node_mask((__node), node_states[__state]) ++ ++#define first_online_node first_node(node_states[N_ONLINE]) ++#define next_online_node(nid) next_node((nid), node_states[N_ONLINE]) ++ + extern int nr_node_ids; + #else +-#define num_online_nodes() 1 +-#define num_possible_nodes() 1 +-#define node_online(node) ((node) == 0) +-#define node_possible(node) ((node) == 0) ++ ++static inline int node_state(int node, enum node_states state) ++{ ++ return node == 0; ++} ++ ++static inline void node_set_state(int node, enum node_states state) ++{ ++} ++ ++static inline void node_clear_state(int node, enum node_states state) ++{ ++} ++ ++static inline int num_node_state(enum node_states state) ++{ ++ return 1; ++} ++ ++#define for_each_node_state(node, __state) \ ++ for ( (node) = 0; (node) == 0; (node) = 1) ++ + #define first_online_node 0 + #define next_online_node(nid) (MAX_NUMNODES) + #define nr_node_ids 1 ++ + 
#endif + ++#define node_online_map node_states[N_ONLINE] ++#define node_possible_map node_states[N_POSSIBLE] ++ + #define any_online_node(mask) \ + ({ \ + int node; \ + for_each_node_mask(node, (mask)) \ + if (node_online(node)) \ + break; \ + node; \ + }) + +-#define node_set_online(node) set_bit((node), node_online_map.bits) +-#define node_set_offline(node) clear_bit((node), node_online_map.bits) ++#define num_online_nodes() num_node_state(N_ONLINE) ++#define num_possible_nodes() num_node_state(N_POSSIBLE) ++#define node_online(node) node_state((node), N_ONLINE) ++#define node_possible(node) node_state((node), N_POSSIBLE) ++ ++#define node_set_online(node) node_set_state((node), N_ONLINE) ++#define node_set_offline(node) node_clear_state((node), N_ONLINE) + +-#define for_each_node(node) for_each_node_mask((node), node_possible_map) +-#define for_each_online_node(node) for_each_node_mask((node), node_online_map) ++#define for_each_node(node) for_each_node_state(node, N_POSSIBLE) ++#define for_each_online_node(node) for_each_node_state(node, N_ONLINE) + + #endif /* __LINUX_NODEMASK_H */ +--- linux-2.6.23.orig/include/linux/sched.h ++++ linux-2.6.23/include/linux/sched.h +@@ -1,10 +1,21 @@ + #ifndef _LINUX_SCHED_H + #define _LINUX_SCHED_H + + #include /* For AT_VECTOR_SIZE */ + ++/* backporting helper macro: */ ++#define cpu_sibling_map(cpu) cpu_sibling_map[cpu] ++ ++/* ++ * * Control groups are not backported - we use a few compatibility ++ * * defines to be able to use the upstream sched.c as-is: ++ * */ ++#define task_pid_nr(task) (task)->pid ++#define task_pid_vnr(task) (task)->pid ++#define find_task_by_vpid(pid) find_task_by_pid(pid) ++ + /* + * cloning flags: + */ + #define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ + #define CLONE_VM 0x00000100 /* set if VM shared between processes */ +@@ -84,10 +95,11 @@ struct sched_param { + #include + #include + #include + #include + #include ++#include + + #include + + struct exec_domain; + struct 
futex_pi_state; +@@ -133,10 +145,11 @@ extern unsigned long nr_active(void); + extern unsigned long nr_iowait(void); + extern unsigned long weighted_cpuload(const int cpu); + + struct seq_file; + struct cfs_rq; ++struct task_group; + #ifdef CONFIG_SCHED_DEBUG + extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); + extern void proc_sched_set_task(struct task_struct *p); + extern void + print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); +@@ -171,12 +184,11 @@ print_cfs_rq(struct seq_file *m, int cpu + #define TASK_TRACED 8 + /* in tsk->exit_state */ + #define EXIT_ZOMBIE 16 + #define EXIT_DEAD 32 + /* in tsk->state again */ +-#define TASK_NONINTERACTIVE 64 +-#define TASK_DEAD 128 ++#define TASK_DEAD 64 + + #define __set_task_state(tsk, state_value) \ + do { (tsk)->state = (state_value); } while (0) + #define set_task_state(tsk, state_value) \ + set_mb((tsk)->state, (state_value)) +@@ -276,10 +288,14 @@ static inline void touch_all_softlockup_ + #endif + + + /* Attach to any functions which should be ignored in wchan output. */ + #define __sched __attribute__((__section__(".sched.text"))) ++ ++/* Linker adds these: start and end of __sched functions */ ++extern char __sched_text_start[], __sched_text_end[]; ++ + /* Is this address in the __sched functions? */ + extern int in_sched_functions(unsigned long addr); + + #define MAX_SCHEDULE_TIMEOUT LONG_MAX + extern signed long FASTCALL(schedule_timeout(signed long timeout)); +@@ -513,10 +529,12 @@ struct signal_struct { + * and for reaped dead child processes forked by this group. + * Live threads maintain their own counters and add to these + * in __exit_signal, except for the group leader. 
+ */ + cputime_t utime, stime, cutime, cstime; ++ cputime_t gtime; ++ cputime_t cgtime; + unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; + unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; + unsigned long inblock, oublock, cinblock, coublock; + + /* +@@ -593,12 +611,27 @@ struct user_struct { + #endif + + /* Hash table maintenance information */ + struct hlist_node uidhash_node; + uid_t uid; ++ ++#ifdef CONFIG_FAIR_USER_SCHED ++ struct task_group *tg; ++#ifdef CONFIG_SYSFS ++ struct kset kset; ++ struct subsys_attribute user_attr; ++ struct work_struct work; ++#endif ++#endif + }; + ++#ifdef CONFIG_FAIR_USER_SCHED ++extern int uids_kobject_init(void); ++#else ++static inline int uids_kobject_init(void) { return 0; } ++#endif ++ + extern struct user_struct *find_user(uid_t); + + extern struct user_struct root_user; + #define INIT_USER (&root_user) + +@@ -606,17 +639,21 @@ struct backing_dev_info; + struct reclaim_state; + + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + struct sched_info { + /* cumulative counters */ +- unsigned long pcnt; /* # of times run on this cpu */ ++ unsigned long pcount; /* # of times run on this cpu */ + unsigned long long cpu_time, /* time spent on the cpu */ + run_delay; /* time spent waiting on a runqueue */ + + /* timestamps */ + unsigned long long last_arrival,/* when we last ran on a cpu */ + last_queued; /* when we were last queued to run */ ++#ifdef CONFIG_SCHEDSTATS ++ /* BKL stats */ ++ unsigned int bkl_count; ++#endif + }; + #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ + + #ifdef CONFIG_SCHEDSTATS + extern const struct file_operations proc_schedstat_operations; +@@ -747,43 +784,42 @@ struct sched_domain { + unsigned int balance_interval; /* initialise to 1. units in ms. 
*/ + unsigned int nr_balance_failed; /* initialise to 0 */ + + #ifdef CONFIG_SCHEDSTATS + /* load_balance() stats */ +- unsigned long lb_cnt[CPU_MAX_IDLE_TYPES]; +- unsigned long lb_failed[CPU_MAX_IDLE_TYPES]; +- unsigned long lb_balanced[CPU_MAX_IDLE_TYPES]; +- unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES]; +- unsigned long lb_gained[CPU_MAX_IDLE_TYPES]; +- unsigned long lb_hot_gained[CPU_MAX_IDLE_TYPES]; +- unsigned long lb_nobusyg[CPU_MAX_IDLE_TYPES]; +- unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES]; ++ unsigned int lb_count[CPU_MAX_IDLE_TYPES]; ++ unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; ++ unsigned int lb_balanced[CPU_MAX_IDLE_TYPES]; ++ unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES]; ++ unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; ++ unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; ++ unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES]; ++ unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES]; + + /* Active load balancing */ +- unsigned long alb_cnt; +- unsigned long alb_failed; +- unsigned long alb_pushed; ++ unsigned int alb_count; ++ unsigned int alb_failed; ++ unsigned int alb_pushed; + + /* SD_BALANCE_EXEC stats */ +- unsigned long sbe_cnt; +- unsigned long sbe_balanced; +- unsigned long sbe_pushed; ++ unsigned int sbe_count; ++ unsigned int sbe_balanced; ++ unsigned int sbe_pushed; + + /* SD_BALANCE_FORK stats */ +- unsigned long sbf_cnt; +- unsigned long sbf_balanced; +- unsigned long sbf_pushed; ++ unsigned int sbf_count; ++ unsigned int sbf_balanced; ++ unsigned int sbf_pushed; + + /* try_to_wake_up() stats */ +- unsigned long ttwu_wake_remote; +- unsigned long ttwu_move_affine; +- unsigned long ttwu_move_balance; ++ unsigned int ttwu_wake_remote; ++ unsigned int ttwu_move_affine; ++ unsigned int ttwu_move_balance; + #endif + }; + +-extern int partition_sched_domains(cpumask_t *partition1, +- cpumask_t *partition2); ++extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new); + + #endif /* CONFIG_SMP */ + + /* + * A runqueue laden with a single nice 0 
task scores a weighted_cpuload of +@@ -851,27 +887,32 @@ struct uts_namespace; + + struct rq; + struct sched_domain; + + struct sched_class { +- struct sched_class *next; ++ const struct sched_class *next; + + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); + void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); +- void (*yield_task) (struct rq *rq, struct task_struct *p); ++ void (*yield_task) (struct rq *rq); + + void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); + + struct task_struct * (*pick_next_task) (struct rq *rq); + void (*put_prev_task) (struct rq *rq, struct task_struct *p); + ++#ifdef CONFIG_SMP + unsigned long (*load_balance) (struct rq *this_rq, int this_cpu, +- struct rq *busiest, +- unsigned long max_nr_move, unsigned long max_load_move, ++ struct rq *busiest, unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, int *this_best_prio); + ++ int (*move_one_task) (struct rq *this_rq, int this_cpu, ++ struct rq *busiest, struct sched_domain *sd, ++ enum cpu_idle_type idle); ++#endif ++ + void (*set_curr_task) (struct rq *rq); + void (*task_tick) (struct rq *rq, struct task_struct *p); + void (*task_new) (struct rq *rq, struct task_struct *p); + }; + +@@ -885,46 +926,52 @@ struct load_weight { + * Current field usage histogram: + * + * 4 se->block_start + * 4 se->run_node + * 4 se->sleep_start +- * 4 se->sleep_start_fair + * 6 se->load.weight +- * 7 se->delta_fair +- * 15 se->wait_runtime + */ + struct sched_entity { +- long wait_runtime; +- unsigned long delta_fair_run; +- unsigned long delta_fair_sleep; +- unsigned long delta_exec; +- s64 fair_key; + struct load_weight load; /* for load-balancing */ + struct rb_node run_node; + unsigned int on_rq; + + u64 exec_start; + u64 sum_exec_runtime; ++ u64 vruntime; + u64 prev_sum_exec_runtime; +- u64 wait_start_fair; +- u64 sleep_start_fair; + + #ifdef CONFIG_SCHEDSTATS + u64 wait_start; + u64 wait_max; +- 
s64 sum_wait_runtime; + + u64 sleep_start; + u64 sleep_max; + s64 sum_sleep_runtime; + + u64 block_start; + u64 block_max; + u64 exec_max; ++ u64 slice_max; + +- unsigned long wait_runtime_overruns; +- unsigned long wait_runtime_underruns; ++ u64 nr_migrations; ++ u64 nr_migrations_cold; ++ u64 nr_failed_migrations_affine; ++ u64 nr_failed_migrations_running; ++ u64 nr_failed_migrations_hot; ++ u64 nr_forced_migrations; ++ u64 nr_forced2_migrations; ++ ++ u64 nr_wakeups; ++ u64 nr_wakeups_sync; ++ u64 nr_wakeups_migrate; ++ u64 nr_wakeups_local; ++ u64 nr_wakeups_remote; ++ u64 nr_wakeups_affine; ++ u64 nr_wakeups_affine_attempts; ++ u64 nr_wakeups_passive; ++ u64 nr_wakeups_idle; + #endif + + #ifdef CONFIG_FAIR_GROUP_SCHED + struct sched_entity *parent; + /* rq on which this entity is (to be) queued: */ +@@ -949,11 +996,11 @@ struct task_struct { + #endif + #endif + + int prio, static_prio, normal_prio; + struct list_head run_list; +- struct sched_class *sched_class; ++ const struct sched_class *sched_class; + struct sched_entity se; + + #ifdef CONFIG_PREEMPT_NOTIFIERS + /* list of struct preempt_notifier: */ + struct hlist_head preempt_notifiers; +@@ -1019,11 +1066,12 @@ struct task_struct { + struct completion *vfork_done; /* for vfork() */ + int __user *set_child_tid; /* CLONE_CHILD_SETTID */ + int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ + + unsigned int rt_priority; +- cputime_t utime, stime; ++ cputime_t utime, stime, utimescaled, stimescaled; ++ cputime_t gtime; + cputime_t prev_utime, prev_stime; + unsigned long nvcsw, nivcsw; /* context switch counts */ + struct timespec start_time; /* monotonic time */ + struct timespec real_start_time; /* boot based time */ + /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ +@@ -1312,10 +1360,11 @@ static inline void put_task_struct(struc + #define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */ + /* Not implemented yet, only for 486*/ + #define 
PF_STARTING 0x00000002 /* being created */ + #define PF_EXITING 0x00000004 /* getting shut down */ + #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ ++#define PF_VCPU 0x00000010 /* I'm a virtual CPU */ + #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ + #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ + #define PF_DUMPCORE 0x00000200 /* dumped core */ + #define PF_SIGNALED 0x00000400 /* killed by a signal */ + #define PF_MEMALLOC 0x00000800 /* Allocating memory */ +@@ -1399,19 +1448,30 @@ extern void idle_task_exit(void); + static inline void idle_task_exit(void) {} + #endif + + extern void sched_idle_next(void); + ++#ifdef CONFIG_SCHED_DEBUG + extern unsigned int sysctl_sched_latency; + extern unsigned int sysctl_sched_min_granularity; + extern unsigned int sysctl_sched_wakeup_granularity; + extern unsigned int sysctl_sched_batch_wakeup_granularity; +-extern unsigned int sysctl_sched_stat_granularity; +-extern unsigned int sysctl_sched_runtime_limit; +-extern unsigned int sysctl_sched_compat_yield; + extern unsigned int sysctl_sched_child_runs_first; + extern unsigned int sysctl_sched_features; ++extern unsigned int sysctl_sched_migration_cost; ++extern unsigned int sysctl_sched_nr_migrate; ++#ifdef CONFIG_FAIR_GROUP_SCHED ++extern unsigned int sysctl_sched_min_bal_int_shares; ++extern unsigned int sysctl_sched_max_bal_int_shares; ++#endif ++ ++int sched_nr_latency_handler(struct ctl_table *table, int write, ++ struct file *file, void __user *buffer, size_t *length, ++ loff_t *ppos); ++#endif ++ ++extern unsigned int sysctl_sched_compat_yield; + + #ifdef CONFIG_RT_MUTEXES + extern int rt_mutex_getprio(struct task_struct *p); + extern void rt_mutex_setprio(struct task_struct *p, int prio); + extern void rt_mutex_adjust_pi(struct task_struct *p); +@@ -1841,10 +1901,22 @@ extern long sched_getaffinity(pid_t pid, + + extern int sched_mc_power_savings, sched_smt_power_savings; + + extern void normalize_rt_tasks(void); + 
++#ifdef CONFIG_FAIR_GROUP_SCHED ++ ++extern struct task_group init_task_group; ++ ++extern struct task_group *sched_create_group(void); ++extern void sched_destroy_group(struct task_group *tg); ++extern void sched_move_task(struct task_struct *tsk); ++extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); ++extern unsigned long sched_group_shares(struct task_group *tg); ++ ++#endif ++ + #ifdef CONFIG_TASK_XACCT + static inline void add_rchar(struct task_struct *tsk, ssize_t amt) + { + tsk->rchar += amt; + } +@@ -1879,8 +1951,16 @@ static inline void inc_syscr(struct task + static inline void inc_syscw(struct task_struct *tsk) + { + } + #endif + ++#ifdef CONFIG_SMP ++void migration_init(void); ++#else ++static inline void migration_init(void) ++{ ++} ++#endif ++ + #endif /* __KERNEL__ */ + + #endif +--- linux-2.6.23.orig/include/linux/taskstats.h ++++ linux-2.6.23/include/linux/taskstats.h +@@ -29,11 +29,11 @@ + * b) add comment indicating new version number at end of struct + * c) add new fields after version comment; maintain 64-bit alignment + */ + + +-#define TASKSTATS_VERSION 5 ++#define TASKSTATS_VERSION 6 + #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN + * in linux/sched.h */ + + struct taskstats { + +@@ -150,10 +150,15 @@ struct taskstats { + __u64 write_bytes; /* bytes of write I/O */ + __u64 cancelled_write_bytes; /* bytes of cancelled write I/O */ + + __u64 nvcsw; /* voluntary_ctxt_switches */ + __u64 nivcsw; /* nonvoluntary_ctxt_switches */ ++ ++ /* time accounting for SMT machines */ ++ __u64 ac_utimescaled; /* utime scaled on frequency etc */ ++ __u64 ac_stimescaled; /* stime scaled on frequency etc */ ++ __u64 cpu_scaled_run_real_total; /* scaled cpu_run_real_total */ + }; + + + /* + * Commands sent from userspace +--- linux-2.6.23.orig/include/linux/topology.h ++++ linux-2.6.23/include/linux/topology.h +@@ -157,19 +157,18 @@ + .max_interval = 4, \ + .busy_factor = 64, \ + .imbalance_pct = 125, \ + .cache_nice_tries = 
1, \ + .busy_idx = 2, \ +- .idle_idx = 0, \ +- .newidle_idx = 0, \ ++ .idle_idx = 1, \ ++ .newidle_idx = 2, \ + .wake_idx = 1, \ + .forkexec_idx = 1, \ + .flags = SD_LOAD_BALANCE \ + | SD_BALANCE_NEWIDLE \ + | SD_BALANCE_EXEC \ + | SD_WAKE_AFFINE \ +- | SD_WAKE_IDLE \ + | BALANCE_FOR_PKG_POWER,\ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .nr_balance_failed = 0, \ + } +--- linux-2.6.23.orig/init/Kconfig ++++ linux-2.6.23/init/Kconfig +@@ -271,18 +271,44 @@ config LOG_BUF_SHIFT + 12 => 4 KB + + config CPUSETS + bool "Cpuset support" + depends on SMP ++ # ++ # disabled for now - depends on control groups, which ++ # are hard to backport: ++ # ++ depends on 0 + help + This option will let you create and manage CPUSETs which + allow dynamically partitioning a system into sets of CPUs and + Memory Nodes and assigning tasks to run only within those sets. + This is primarily useful on large SMP or NUMA systems. + + Say N if unsure. + ++config FAIR_GROUP_SCHED ++ bool "Fair group CPU scheduler" ++ default y ++ depends on EXPERIMENTAL ++ help ++ This feature lets CPU scheduler recognize task groups and control CPU ++ bandwidth allocation to such task groups. ++ ++choice ++ depends on FAIR_GROUP_SCHED ++ prompt "Basis for grouping tasks" ++ default FAIR_USER_SCHED ++ ++config FAIR_USER_SCHED ++ bool "user id" ++ help ++ This option will choose userid as the basis for grouping ++ tasks, thus providing equal CPU bandwidth to each user. 
++ ++endchoice ++ + config SYSFS_DEPRECATED + bool "Create deprecated sysfs files" + default y + help + This option creates deprecated symlinks such as the +--- linux-2.6.23.orig/init/main.c ++++ linux-2.6.23/init/main.c +@@ -750,15 +750,12 @@ static int __init nosoftlockup_setup(cha + __setup("nosoftlockup", nosoftlockup_setup); + + static void __init do_pre_smp_initcalls(void) + { + extern int spawn_ksoftirqd(void); +-#ifdef CONFIG_SMP +- extern int migration_init(void); + + migration_init(); +-#endif + spawn_ksoftirqd(); + if (!nosoftlockup) + spawn_softlockup_task(); + } + +--- linux-2.6.23.orig/kernel/delayacct.c ++++ linux-2.6.23/kernel/delayacct.c +@@ -113,15 +113,21 @@ int __delayacct_add_tsk(struct taskstats + tmp = (s64)d->cpu_run_real_total; + cputime_to_timespec(tsk->utime + tsk->stime, &ts); + tmp += timespec_to_ns(&ts); + d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; + ++ tmp = (s64)d->cpu_scaled_run_real_total; ++ cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts); ++ tmp += timespec_to_ns(&ts); ++ d->cpu_scaled_run_real_total = ++ (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; ++ + /* + * No locking available for sched_info (and too expensive to add one) + * Mitigate by taking snapshot of values + */ +- t1 = tsk->sched_info.pcnt; ++ t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; + t3 = tsk->sched_info.cpu_time; + + d->cpu_count += t1; + +--- linux-2.6.23.orig/kernel/exit.c ++++ linux-2.6.23/kernel/exit.c +@@ -109,10 +109,11 @@ static void __exit_signal(struct task_st + * We won't ever get here for the group leader, since it + * will have been the last reference on the signal_struct. 
+ */ + sig->utime = cputime_add(sig->utime, tsk->utime); + sig->stime = cputime_add(sig->stime, tsk->stime); ++ sig->gtime = cputime_add(sig->gtime, tsk->gtime); + sig->min_flt += tsk->min_flt; + sig->maj_flt += tsk->maj_flt; + sig->nvcsw += tsk->nvcsw; + sig->nivcsw += tsk->nivcsw; + sig->inblock += task_io_get_inblock(tsk); +@@ -1240,10 +1241,15 @@ static int wait_task_zombie(struct task_ + psig->cstime = + cputime_add(psig->cstime, + cputime_add(p->stime, + cputime_add(sig->stime, + sig->cstime))); ++ psig->cgtime = ++ cputime_add(psig->cgtime, ++ cputime_add(p->gtime, ++ cputime_add(sig->gtime, ++ sig->cgtime))); + psig->cmin_flt += + p->min_flt + sig->min_flt + sig->cmin_flt; + psig->cmaj_flt += + p->maj_flt + sig->maj_flt + sig->cmaj_flt; + psig->cnvcsw += +--- linux-2.6.23.orig/kernel/fork.c ++++ linux-2.6.23/kernel/fork.c +@@ -875,10 +875,12 @@ static inline int copy_signal(unsigned l + + sig->leader = 0; /* session leadership doesn't inherit */ + sig->tty_old_pgrp = NULL; + + sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; ++ sig->gtime = cputime_zero; ++ sig->cgtime = cputime_zero; + sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; + sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; + sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; + sig->sum_sched_runtime = 0; + INIT_LIST_HEAD(&sig->cpu_timers[0]); +@@ -1045,10 +1047,13 @@ static struct task_struct *copy_process( + + p->utime = cputime_zero; + p->stime = cputime_zero; + p->prev_utime = cputime_zero; + p->prev_stime = cputime_zero; ++ p->gtime = cputime_zero; ++ p->utimescaled = cputime_zero; ++ p->stimescaled = cputime_zero; + + #ifdef CONFIG_TASK_XACCT + p->rchar = 0; /* I/O counter: bytes read */ + p->wchar = 0; /* I/O counter: bytes written */ + p->syscr = 0; /* I/O counter: read syscalls */ +--- linux-2.6.23.orig/kernel/ksysfs.c ++++ linux-2.6.23/kernel/ksysfs.c +@@ -12,10 +12,11 @@ + #include + #include + #include + #include + #include 
++#include + + #define KERNEL_ATTR_RO(_name) \ + static struct subsys_attribute _name##_attr = __ATTR_RO(_name) + + #define KERNEL_ATTR_RW(_name) \ +@@ -114,9 +115,16 @@ static int __init ksysfs_init(void) + notes_attr.size = notes_size; + error = sysfs_create_bin_file(&kernel_subsys.kobj, + &notes_attr); + } + ++ /* ++ * Create "/sys/kernel/uids" directory and corresponding root user's ++ * directory under it. ++ */ ++ if (!error) ++ error = uids_kobject_init(); ++ + return error; + } + + core_initcall(ksysfs_init); +--- linux-2.6.23.orig/kernel/sched.c ++++ linux-2.6.23/kernel/sched.c +@@ -42,10 +42,11 @@ + #include + #include + #include + #include + #include ++#include + #include + #include + #include + #include + #include +@@ -59,21 +60,23 @@ + #include + #include + #include + #include + #include ++#include + + #include ++#include + + /* + * Scheduler clock - returns current time in nanosec units. + * This is default implementation. + * Architectures and sub-architectures can override this. + */ + unsigned long long __attribute__((weak)) sched_clock(void) + { +- return (unsigned long long)jiffies * (1000000000 / HZ); ++ return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); + } + + /* + * Convert user-nice values [ -20 ... 0 ...
19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], +@@ -93,24 +96,22 @@ unsigned long long __attribute__((weak)) + #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) + + /* + * Some helpers for converting nanosecond timing to jiffy resolution + */ +-#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) +-#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) ++#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) ++#define JIFFIES_TO_NS(TIME) ((TIME) * (NSEC_PER_SEC / HZ)) + + #define NICE_0_LOAD SCHED_LOAD_SCALE + #define NICE_0_SHIFT SCHED_LOAD_SHIFT + + /* + * These are the 'tuning knobs' of the scheduler: + * +- * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), +- * default timeslice is 100 msecs, maximum timeslice is 800 msecs. ++ * default timeslice is 100 msecs (used only for SCHED_RR tasks). + * Timeslices get refilled after they expire. + */ +-#define MIN_TIMESLICE max(5 * HZ / 1000, 1) + #define DEF_TIMESLICE (100 * HZ / 1000) + + #ifdef CONFIG_SMP + /* + * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) +@@ -130,28 +131,10 @@ static inline void sg_inc_cpu_power(stru + sg->__cpu_power += val; + sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); + } + #endif + +-#define SCALE_PRIO(x, prio) \ +- max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) +- +-/* +- * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] +- * to time slice values: [800ms ... 100ms ... 
5ms] +- */ +-static unsigned int static_prio_timeslice(int static_prio) +-{ +- if (static_prio == NICE_TO_PRIO(19)) +- return 1; +- +- if (static_prio < NICE_TO_PRIO(0)) +- return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); +- else +- return SCALE_PRIO(DEF_TIMESLICE, static_prio); +-} +- + static inline int rt_policy(int policy) + { + if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) + return 1; + return 0; +@@ -168,45 +151,115 @@ static inline int task_has_rt_policy(str + struct rt_prio_array { + DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ + struct list_head queue[MAX_RT_PRIO]; + }; + +-struct load_stat { +- struct load_weight load; +- u64 load_update_start, load_update_last; +- unsigned long delta_fair, delta_exec, delta_stat; ++#ifdef CONFIG_FAIR_GROUP_SCHED ++ ++#include ++ ++struct cfs_rq; ++ ++/* task group related information */ ++struct task_group { ++#ifdef CONFIG_FAIR_CGROUP_SCHED ++ struct cgroup_subsys_state css; ++#endif ++ /* schedulable entities of this group on each cpu */ ++ struct sched_entity **se; ++ /* runqueue "owned" by this group on each cpu */ ++ struct cfs_rq **cfs_rq; ++ unsigned long shares; ++ /* spinlock to serialize modification to shares */ ++ spinlock_t lock; ++ struct rcu_head rcu; ++}; ++ ++/* Default task group's sched entity on each cpu */ ++static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); ++/* Default task group's cfs_rq on each cpu */ ++static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; ++ ++static struct sched_entity *init_sched_entity_p[NR_CPUS]; ++static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; ++ ++/* Default task group. ++ * Every task in system belong to this group at bootup. 
++ */ ++struct task_group init_task_group = { ++ .se = init_sched_entity_p, ++ .cfs_rq = init_cfs_rq_p, + }; + ++#ifdef CONFIG_FAIR_USER_SCHED ++# define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD ++#else ++# define INIT_TASK_GRP_LOAD NICE_0_LOAD ++#endif ++ ++static int init_task_group_load = INIT_TASK_GRP_LOAD; ++ ++/* return group to which a task belongs */ ++static inline struct task_group *task_group(struct task_struct *p) ++{ ++ struct task_group *tg; ++ ++#ifdef CONFIG_FAIR_USER_SCHED ++ tg = p->user->tg; ++#elif defined(CONFIG_FAIR_CGROUP_SCHED) ++ tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), ++ struct task_group, css); ++#else ++ tg = &init_task_group; ++#endif ++ return tg; ++} ++ ++/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ ++static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) ++{ ++ p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; ++ p->se.parent = task_group(p)->se[cpu]; ++} ++ ++#else ++ ++static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { } ++ ++#endif /* CONFIG_FAIR_GROUP_SCHED */ ++ + /* CFS-related fields in a runqueue */ + struct cfs_rq { + struct load_weight load; + unsigned long nr_running; + +- s64 fair_clock; + u64 exec_clock; +- s64 wait_runtime; +- u64 sleeper_bonus; +- unsigned long wait_runtime_overruns, wait_runtime_underruns; ++ u64 min_vruntime; + + struct rb_root tasks_timeline; + struct rb_node *rb_leftmost; + struct rb_node *rb_load_balance_curr; +-#ifdef CONFIG_FAIR_GROUP_SCHED + /* 'curr' points to currently running entity on this cfs_rq. + * It is set to NULL otherwise (i.e when none are currently running). 
+ */ + struct sched_entity *curr; ++ ++ unsigned long nr_spread_over; ++ ++#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ + +- /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in ++ /* ++ * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities + * (like users, containers etc.) + * + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This + * list is used during load balance. + */ +- struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ ++ struct list_head leaf_cfs_rq_list; ++ struct task_group *tg; /* group that "owns" this runqueue */ + #endif + }; + + /* Real-Time classes' related field in a runqueue: */ + struct rt_rq { +@@ -221,11 +274,12 @@ struct rt_rq { + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ + struct rq { +- spinlock_t lock; /* runqueue lock */ ++ /* runqueue lock: */ ++ spinlock_t lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. 
+ */ +@@ -234,19 +288,21 @@ struct rq { + unsigned long cpu_load[CPU_LOAD_IDX_MAX]; + unsigned char idle_at_tick; + #ifdef CONFIG_NO_HZ + unsigned char in_nohz_recently; + #endif +- struct load_stat ls; /* capture load from *all* tasks on this cpu */ ++ /* capture load from *all* tasks on this cpu: */ ++ struct load_weight load; + unsigned long nr_load_updates; + u64 nr_switches; + + struct cfs_rq cfs; + #ifdef CONFIG_FAIR_GROUP_SCHED +- struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */ ++ /* list of leaf cfs_rq on this cpu: */ ++ struct list_head leaf_cfs_rq_list; + #endif +- struct rt_rq rt; ++ struct rt_rq rt; + + /* + * This is part of a global counter where only the total sum + * over all CPUs matters. A task can increase this counter on + * one CPU and if it got migrated afterwards it may decrease +@@ -272,34 +328,38 @@ struct rq { + struct sched_domain *sd; + + /* For active balancing */ + int active_balance; + int push_cpu; +- int cpu; /* cpu of this runqueue */ ++ /* cpu of this runqueue: */ ++ int cpu; + + struct task_struct *migration_thread; + struct list_head migration_queue; + #endif + + #ifdef CONFIG_SCHEDSTATS + /* latency stats */ + struct sched_info rq_sched_info; + + /* sys_sched_yield() stats */ +- unsigned long yld_exp_empty; +- unsigned long yld_act_empty; +- unsigned long yld_both_empty; +- unsigned long yld_cnt; ++ unsigned int yld_exp_empty; ++ unsigned int yld_act_empty; ++ unsigned int yld_both_empty; ++ unsigned int yld_count; + + /* schedule() stats */ +- unsigned long sched_switch; +- unsigned long sched_cnt; +- unsigned long sched_goidle; ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; + + /* try_to_wake_up() stats */ +- unsigned long ttwu_cnt; +- unsigned long ttwu_local; ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++ ++ /* BKL stats */ ++ unsigned int bkl_count; + #endif + struct lock_class_key rq_lock_key; + }; + + static 
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +@@ -380,10 +440,45 @@ static void update_rq_clock(struct rq *r + #define this_rq() (&__get_cpu_var(runqueues)) + #define task_rq(p) cpu_rq(task_cpu(p)) + #define cpu_curr(cpu) (cpu_rq(cpu)->curr) + + /* ++ * Tunables that become constants when CONFIG_SCHED_DEBUG is off: ++ */ ++#ifdef CONFIG_SCHED_DEBUG ++# define const_debug __read_mostly ++#else ++# define const_debug static const ++#endif ++ ++/* ++ * Debugging: various feature bits ++ */ ++enum { ++ SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, ++ SCHED_FEAT_WAKEUP_PREEMPT = 2, ++ SCHED_FEAT_START_DEBIT = 4, ++ SCHED_FEAT_TREE_AVG = 8, ++ SCHED_FEAT_APPROX_AVG = 16, ++}; ++ ++const_debug unsigned int sysctl_sched_features = ++ SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | ++ SCHED_FEAT_WAKEUP_PREEMPT * 1 | ++ SCHED_FEAT_START_DEBIT * 1 | ++ SCHED_FEAT_TREE_AVG * 0 | ++ SCHED_FEAT_APPROX_AVG * 0; ++ ++#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) ++ ++/* ++ * Number of tasks to iterate in a single balance run. ++ * Limited because this is done with IRQs disabled. 
++ */ ++const_debug unsigned int sysctl_sched_nr_migrate = 32; ++ ++/* + * For kernel-internal use: high-speed (but slightly incorrect) per-cpu + * clock constructed from sched_clock(): + */ + unsigned long long cpu_clock(int cpu) + { +@@ -391,40 +486,39 @@ unsigned long long cpu_clock(int cpu) + unsigned long flags; + struct rq *rq; + + local_irq_save(flags); + rq = cpu_rq(cpu); +- update_rq_clock(rq); ++ /* ++ * Only call sched_clock() if the scheduler has already been ++ * initialized (some code might call cpu_clock() very early): ++ */ ++ if (rq->idle) ++ update_rq_clock(rq); + now = rq->clock; + local_irq_restore(flags); + + return now; + } +- +-#ifdef CONFIG_FAIR_GROUP_SCHED +-/* Change a task's ->cfs_rq if it moves across CPUs */ +-static inline void set_task_cfs_rq(struct task_struct *p) +-{ +- p->se.cfs_rq = &task_rq(p)->cfs; +-} +-#else +-static inline void set_task_cfs_rq(struct task_struct *p) +-{ +-} +-#endif ++EXPORT_SYMBOL_GPL(cpu_clock); + + #ifndef prepare_arch_switch + # define prepare_arch_switch(next) do { } while (0) + #endif + #ifndef finish_arch_switch + # define finish_arch_switch(prev) do { } while (0) + #endif + ++static inline int task_current(struct rq *rq, struct task_struct *p) ++{ ++ return rq->curr == p; ++} ++ + #ifndef __ARCH_WANT_UNLOCKED_CTXSW + static inline int task_running(struct rq *rq, struct task_struct *p) + { +- return rq->curr == p; ++ return task_current(rq, p); + } + + static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) + { + } +@@ -449,11 +543,11 @@ static inline void finish_lock_switch(st + static inline int task_running(struct rq *rq, struct task_struct *p) + { + #ifdef CONFIG_SMP + return p->oncpu; + #else +- return rq->curr == p; ++ return task_current(rq, p); + #endif + } + + static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) + { +@@ -494,44 +588,40 @@ static inline void finish_lock_switch(st + * Must be called interrupts disabled. 
+ */ + static inline struct rq *__task_rq_lock(struct task_struct *p) + __acquires(rq->lock) + { +- struct rq *rq; +- +-repeat_lock_task: +- rq = task_rq(p); +- spin_lock(&rq->lock); +- if (unlikely(rq != task_rq(p))) { ++ for (;;) { ++ struct rq *rq = task_rq(p); ++ spin_lock(&rq->lock); ++ if (likely(rq == task_rq(p))) ++ return rq; + spin_unlock(&rq->lock); +- goto repeat_lock_task; + } +- return rq; + } + + /* + * task_rq_lock - lock the runqueue a given task resides on and disable +- * interrupts. Note the ordering: we can safely lookup the task_rq without ++ * interrupts. Note the ordering: we can safely lookup the task_rq without + * explicitly disabling preemption. + */ + static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) + __acquires(rq->lock) + { + struct rq *rq; + +-repeat_lock_task: +- local_irq_save(*flags); +- rq = task_rq(p); +- spin_lock(&rq->lock); +- if (unlikely(rq != task_rq(p))) { ++ for (;;) { ++ local_irq_save(*flags); ++ rq = task_rq(p); ++ spin_lock(&rq->lock); ++ if (likely(rq == task_rq(p))) ++ return rq; + spin_unlock_irqrestore(&rq->lock, *flags); +- goto repeat_lock_task; + } +- return rq; + } + +-static inline void __task_rq_unlock(struct rq *rq) ++static void __task_rq_unlock(struct rq *rq) + __releases(rq->lock) + { + spin_unlock(&rq->lock); + } + +@@ -542,11 +632,11 @@ static inline void task_rq_unlock(struct + } + + /* + * this_rq_lock - lock this runqueue and disable interrupts. 
+ */ +-static inline struct rq *this_rq_lock(void) ++static struct rq *this_rq_lock(void) + __acquires(rq->lock) + { + struct rq *rq; + + local_irq_disable(); +@@ -576,10 +666,11 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep + void sched_clock_idle_wakeup_event(u64 delta_ns) + { + struct rq *rq = cpu_rq(smp_processor_id()); + u64 now = sched_clock(); + ++ touch_softlockup_watchdog(); + rq->idle_clock += delta_ns; + /* + * Override the previous timestamp and ignore all + * sched_clock() deltas that occured while we idled, + * and use the PM-provided delta_ns to advance the +@@ -642,23 +733,10 @@ static inline void resched_task(struct t + assert_spin_locked(&task_rq(p)->lock); + set_tsk_need_resched(p); + } + #endif + +-static u64 div64_likely32(u64 divident, unsigned long divisor) +-{ +-#if BITS_PER_LONG == 32 +- if (likely(divident <= 0xffffffffULL)) +- return (u32)divident / divisor; +- do_div(divident, divisor); +- +- return divident; +-#else +- return divident / divisor; +-#endif +-} +- + #if BITS_PER_LONG == 32 + # define WMULT_CONST (~0UL) + #else + # define WMULT_CONST (1UL << 32) + #endif +@@ -696,27 +774,25 @@ static inline unsigned long + calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) + { + return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); + } + +-static void update_load_add(struct load_weight *lw, unsigned long inc) ++static inline void update_load_add(struct load_weight *lw, unsigned long inc) + { + lw->weight += inc; +- lw->inv_weight = 0; + } + +-static void update_load_sub(struct load_weight *lw, unsigned long dec) ++static inline void update_load_sub(struct load_weight *lw, unsigned long dec) + { + lw->weight -= dec; +- lw->inv_weight = 0; + } + + /* + * To aid in avoiding the subversion of "niceness" due to uneven distribution + * of tasks with abnormal "nice" values across CPUs the contribution that + * each task makes to its run queue's load is weighted according to its +- * scheduling class and "nice" value. 
For SCHED_NORMAL tasks this is just a ++ * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a + * scaled version of the new time slice allocation that they receive on time + * slice expiry etc. + */ + + #define WEIGHT_IDLEPRIO 2 +@@ -774,76 +850,62 @@ struct rq_iterator { + void *arg; + struct task_struct *(*start)(void *); + struct task_struct *(*next)(void *); + }; + +-static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, +- unsigned long max_nr_move, unsigned long max_load_move, +- struct sched_domain *sd, enum cpu_idle_type idle, +- int *all_pinned, unsigned long *load_moved, +- int *this_best_prio, struct rq_iterator *iterator); ++#ifdef CONFIG_SMP ++static unsigned long ++balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, ++ unsigned long max_load_move, struct sched_domain *sd, ++ enum cpu_idle_type idle, int *all_pinned, ++ int *this_best_prio, struct rq_iterator *iterator); ++ ++static int ++iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, ++ struct sched_domain *sd, enum cpu_idle_type idle, ++ struct rq_iterator *iterator); ++#endif ++ ++#ifdef CONFIG_CGROUP_CPUACCT ++static void cpuacct_charge(struct task_struct *tsk, u64 cputime); ++#else ++static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} ++#endif + + #include "sched_stats.h" +-#include "sched_rt.c" +-#include "sched_fair.c" + #include "sched_idletask.c" ++#include "sched_fair.c" ++#include "sched_rt.c" + #ifdef CONFIG_SCHED_DEBUG + # include "sched_debug.c" + #endif + + #define sched_class_highest (&rt_sched_class) + +-static void __update_curr_load(struct rq *rq, struct load_stat *ls) +-{ +- if (rq->curr != rq->idle && ls->load.weight) { +- ls->delta_exec += ls->delta_stat; +- ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); +- ls->delta_stat = 0; +- } +-} +- + /* + * Update delta_exec, delta_fair fields for rq. 
+ * + * delta_fair clock advances at a rate inversely proportional to +- * total load (rq->ls.load.weight) on the runqueue, while ++ * total load (rq->load.weight) on the runqueue, while + * delta_exec advances at the same rate as wall-clock (provided + * cpu is not idle). + * + * delta_exec / delta_fair is a measure of the (smoothened) load on this + * runqueue over any given interval. This (smoothened) load is used + * during load balance. + * +- * This function is called /before/ updating rq->ls.load ++ * This function is called /before/ updating rq->load + * and when switching tasks. + */ +-static void update_curr_load(struct rq *rq) +-{ +- struct load_stat *ls = &rq->ls; +- u64 start; +- +- start = ls->load_update_start; +- ls->load_update_start = rq->clock; +- ls->delta_stat += rq->clock - start; +- /* +- * Stagger updates to ls->delta_fair. Very frequent updates +- * can be expensive. +- */ +- if (ls->delta_stat >= sysctl_sched_stat_granularity) +- __update_curr_load(rq, ls); +-} +- + static inline void inc_load(struct rq *rq, const struct task_struct *p) + { +- update_curr_load(rq); +- update_load_add(&rq->ls.load, p->se.load.weight); ++ update_load_add(&rq->load, p->se.load.weight); + } + + static inline void dec_load(struct rq *rq, const struct task_struct *p) + { +- update_curr_load(rq); +- update_load_sub(&rq->ls.load, p->se.load.weight); ++ update_load_sub(&rq->load, p->se.load.weight); + } + + static void inc_nr_running(struct task_struct *p, struct rq *rq) + { + rq->nr_running++; +@@ -856,12 +918,10 @@ static void dec_nr_running(struct task_s + dec_load(rq, p); + } + + static void set_load_weight(struct task_struct *p) + { +- p->se.wait_runtime = 0; +- + if (task_has_rt_policy(p)) { + p->se.load.weight = prio_to_weight[0] * 2; + p->se.load.inv_weight = prio_to_wmult[0] >> 1; + return; + } +@@ -949,24 +1009,10 @@ static void activate_task(struct rq *rq, + enqueue_task(rq, p, wakeup); + inc_nr_running(p, rq); + } + + /* +- * activate_idle_task - move 
idle task to the _front_ of runqueue. +- */ +-static inline void activate_idle_task(struct task_struct *p, struct rq *rq) +-{ +- update_rq_clock(rq); +- +- if (p->state == TASK_UNINTERRUPTIBLE) +- rq->nr_uninterruptible--; +- +- enqueue_task(rq, p, 0); +- inc_nr_running(p, rq); +-} +- +-/* + * deactivate_task - remove a task from the runqueue. + */ + static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) + { + if (p->state == TASK_UNINTERRUPTIBLE) +@@ -986,45 +1032,76 @@ inline int task_curr(const struct task_s + } + + /* Used instead of source_load when we know the type == 0 */ + unsigned long weighted_cpuload(const int cpu) + { +- return cpu_rq(cpu)->ls.load.weight; ++ return cpu_rq(cpu)->load.weight; + } + + static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) + { ++ set_task_cfs_rq(p, cpu); + #ifdef CONFIG_SMP ++ /* ++ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be ++ * successfully executed on another CPU. We must ensure that updates of ++ * per-task data have been completed by this moment. ++ */ ++ smp_wmb(); + task_thread_info(p)->cpu = cpu; +- set_task_cfs_rq(p); + #endif + } + + #ifdef CONFIG_SMP + ++/* ++ * Is this task likely cache-hot: ++ */ ++static inline int ++task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) ++{ ++ s64 delta; ++ ++ if (p->sched_class