linux 2.6.23: keep sched-cfs locally updated to 2.6.23.17

author: Marcin Juszkiewicz <marcin@juszkiewicz.com.pl> 2009-10-21 12:22:20 +0200
committer: Marcin Juszkiewicz <marcin@juszkiewicz.com.pl> 2009-12-14 11:34:58 +0100
commit: 9d1b79b7848e13e1bf80b736671f76144cc508d4 (patch)
tree: c5e1ff0f4b3c2f08444666fdf8348aa8610fc98d
parent: 575cf43aa9df4192aa9125258545e7943a45f4d5 (diff)
2 files changed, 8569 insertions, 2 deletions
diff --git a/recipes/linux/linux-2.6.23/sched-cfs-v2.6.23.12-v24.1.patch b/recipes/linux/linux-2.6.23/sched-cfs-v2.6.23.12-v24.1.patch
new file mode 100644
index 0000000000..77ee5c8f1d
--- /dev/null
+++ b/recipes/linux/linux-2.6.23/sched-cfs-v2.6.23.12-v24.1.patch
@@ -0,0 +1,8567 @@
+---
+ Documentation/sched-design-CFS.txt |   67 +
+ Makefile                           |    2 
+ arch/i386/Kconfig                  |   11 
+ drivers/kvm/kvm.h                  |   10 
+ fs/pipe.c                          |    9 
+ fs/proc/array.c                    |   21 
+ fs/proc/base.c                     |    2 
+ fs/proc/proc_misc.c                |   15 
+ include/linux/cgroup.h             |   12 
+ include/linux/cpuset.h             |    5 
+ include/linux/kernel.h             |    7 
+ include/linux/kernel_stat.h        |    3 
+ include/linux/nodemask.h           |   94 +
+ include/linux/sched.h              |  174 ++
+ include/linux/taskstats.h          |    7 
+ include/linux/topology.h           |    5 
+ init/Kconfig                       |   26 
+ init/main.c                        |    3 
+ kernel/delayacct.c                 |    8 
+ kernel/exit.c                      |    6 
+ kernel/fork.c                      |    5 
+ kernel/ksysfs.c                    |    8 
+ kernel/sched.c                     | 2310 +++++++++++++++++++++++--------------
+ kernel/sched_debug.c               |  289 +++-
+ kernel/sched_fair.c                |  885 ++++++--------
+ kernel/sched_idletask.c            |   26 
+ kernel/sched_rt.c                  |   54 
+ kernel/sched_stats.h               |   40 
+ kernel/sysctl.c                    |   40 
+ kernel/timer.c                     |    7 
+ kernel/tsacct.c                    |    4 
+ kernel/user.c                      |  249 +++
+ mm/memory_hotplug.c                |    7 
+ mm/page_alloc.c                    |   50 
+ mm/vmscan.c                        |    4 
+ net/unix/af_unix.c                 |    4 
+ 36 files changed, 2883 insertions(+), 1586 deletions(-)
+
+--- linux-2.6.23.orig/Documentation/sched-design-CFS.txt
++++ linux-2.6.23/Documentation/sched-design-CFS.txt
+@@ -115,5 +115,72 @@ Some implementation details:
+  - reworked/sanitized SMP load-balancing: the runqueue-walking
+    assumptions are gone from the load-balancing code now, and
+    iterators of the scheduling modules are used. The balancing code got
+    quite a bit simpler as a result.
+ 
++
++Group scheduler extension to CFS
++================================
++
++Normally the scheduler operates on individual tasks and strives to provide
++fair CPU time to each task. Sometimes, it may be desirable to group tasks
++and provide fair CPU time to each such task group. For example, it may
++be desirable to first provide fair CPU time to each user on the system
++and then to each task belonging to a user.
++
++CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets
++SCHED_NORMAL/BATCH tasks be grouped and divides CPU time fairly among such
++groups. At present, there are two (mutually exclusive) mechanisms to group
++tasks for CPU bandwidth control purpose:
++
++	- Based on user id (CONFIG_FAIR_USER_SCHED)
++		In this option, tasks are grouped according to their user id.
++	- Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED)
++		This option lets the administrator create arbitrary groups
++		of tasks, using the "cgroup" pseudo filesystem. See
++		Documentation/cgroups.txt for more information about this
++		filesystem.
++
++Only one of these options to group tasks can be chosen and not both.
++
++Group scheduler tunables:
++
++When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for
++each new user and a "cpu_share" file is added in that directory.
++
++	# cd /sys/kernel/uids
++	# cat 512/cpu_share		# Display user 512's CPU share
++	1024
++	# echo 2048 > 512/cpu_share	# Modify user 512's CPU share
++	# cat 512/cpu_share		# Display user 512's CPU share
++	2048
++	#
++
++CPU bandwidth between two users is divided in the ratio of their CPU shares.
++For example, if you would like user "root" to get twice the bandwidth of user
++"guest", then set the cpu_share for both users such that "root"'s
++cpu_share is twice "guest"'s cpu_share.
++
++
++When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created
++for each group created using the pseudo filesystem. See example steps
++below to create task groups and modify their CPU share using the "cgroups"
++pseudo filesystem:
++
++	# mkdir /dev/cpuctl
++	# mount -t cgroup -ocpu none /dev/cpuctl
++	# cd /dev/cpuctl
++
++	# mkdir multimedia	# create "multimedia" group of tasks
++	# mkdir browser		# create "browser" group of tasks
++
++	# #Configure the multimedia group to receive twice the CPU bandwidth
++	# #of the browser group
++
++	# echo 2048 > multimedia/cpu.shares
++	# echo 1024 > browser/cpu.shares
++
++	# firefox &	# Launch firefox and move it to "browser" group
++	# echo <firefox_pid> > browser/tasks
++
++	# #Launch gmplayer (or your favourite movie player)
++	# echo <movie_player_pid> > multimedia/tasks
+--- linux-2.6.23.orig/Makefile
++++ linux-2.6.23/Makefile
+@@ -1,9 +1,9 @@
+ VERSION = 2
+ PATCHLEVEL = 6
+ SUBLEVEL = 23
+-EXTRAVERSION = .17
++EXTRAVERSION = .17-cfs-v24.1
+ NAME = Arr Matey! A Hairy Bilge Rat!
+ 
+ # *DOCUMENTATION*
+ # To see a list of typical targets execute "make help"
+ # More info can be located in ./README
+--- linux-2.6.23.orig/arch/i386/Kconfig
++++ linux-2.6.23/arch/i386/Kconfig
+@@ -212,10 +212,21 @@ config X86_ES7000
+ 	  Only choose this option if you have such a system, otherwise you
+ 	  should say N here.
+ 
+ endchoice
+ 
++config SCHED_NO_NO_OMIT_FRAME_POINTER
++	bool "Single-depth WCHAN output"
++	default y
++	help
++	  Calculate simpler /proc/<PID>/wchan values. If this option
++	  is disabled then wchan values will recurse back to the
++	  caller function. This provides more accurate wchan values,
++	  at the expense of slightly more scheduling overhead.
++
++	  If in doubt, say "Y".
++
+ config PARAVIRT
+ 	bool "Paravirtualization support (EXPERIMENTAL)"
+ 	depends on EXPERIMENTAL
+ 	depends on !(X86_VISWS || X86_VOYAGER)
+ 	help
+--- linux-2.6.23.orig/drivers/kvm/kvm.h
++++ linux-2.6.23/drivers/kvm/kvm.h
+@@ -623,10 +623,20 @@ void __kvm_mmu_free_some_pages(struct kv
+ int kvm_mmu_load(struct kvm_vcpu *vcpu);
+ void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+ 
+ int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run);
+ 
++static inline void kvm_guest_enter(void)
++{
++	current->flags |= PF_VCPU;
++}
++
++static inline void kvm_guest_exit(void)
++{
++	current->flags &= ~PF_VCPU;
++}
++
+ static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
+ 				     u32 error_code)
+ {
+ 	return vcpu->mmu.page_fault(vcpu, gva, error_code);
+ }
+--- linux-2.6.23.orig/fs/pipe.c
++++ linux-2.6.23/fs/pipe.c
+@@ -43,12 +43,11 @@ void pipe_wait(struct pipe_inode_info *p
+ 
+ 	/*
+ 	 * Pipes are system-local resources, so sleeping on them
+ 	 * is considered a noninteractive wait:
+ 	 */
+-	prepare_to_wait(&pipe->wait, &wait,
+-			TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE);
++	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
+ 	if (pipe->inode)
+ 		mutex_unlock(&pipe->inode->i_mutex);
+ 	schedule();
+ 	finish_wait(&pipe->wait, &wait);
+ 	if (pipe->inode)
+@@ -381,11 +380,11 @@ redo:
+ 	}
+ 	mutex_unlock(&inode->i_mutex);
+ 
+ 	/* Signal writers asynchronously that there is more room. */
+ 	if (do_wakeup) {
+-		wake_up_interruptible(&pipe->wait);
++		wake_up_interruptible_sync(&pipe->wait);
+ 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
+ 	}
+ 	if (ret > 0)
+ 		file_accessed(filp);
+ 	return ret;
+@@ -554,11 +553,11 @@ redo2:
+ 		pipe->waiting_writers--;
+ 	}
+ out:
+ 	mutex_unlock(&inode->i_mutex);
+ 	if (do_wakeup) {
+-		wake_up_interruptible(&pipe->wait);
++		wake_up_interruptible_sync(&pipe->wait);
+ 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+ 	}
+ 	if (ret > 0)
+ 		file_update_time(filp);
+ 	return ret;
+@@ -648,11 +647,11 @@ pipe_release(struct inode *inode, int de
+ 	pipe->writers -= decw;
+ 
+ 	if (!pipe->readers && !pipe->writers) {
+ 		free_pipe_info(inode);
+ 	} else {
+-		wake_up_interruptible(&pipe->wait);
++		wake_up_interruptible_sync(&pipe->wait);
+ 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+ 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
+ 	}
+ 	mutex_unlock(&inode->i_mutex);
+ 
+--- linux-2.6.23.orig/fs/proc/array.c
++++ linux-2.6.23/fs/proc/array.c
+@@ -365,15 +365,22 @@ static cputime_t task_stime(struct task_
+ 	 * grows monotonically - apps rely on that):
+ 	 */
+ 	stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
+ 			cputime_to_clock_t(task_utime(p));
+ 
+-	p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
++	if (stime >= 0)
++		p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
++
+ 	return p->prev_stime;
+ }
+ #endif
+ 
++static cputime_t task_gtime(struct task_struct *p)
++{
++	return p->gtime;
++}
++
+ static int do_task_stat(struct task_struct *task, char *buffer, int whole)
+ {
+ 	unsigned long vsize, eip, esp, wchan = ~0UL;
+ 	long priority, nice;
+ 	int tty_pgrp = -1, tty_nr = 0;
+@@ -385,10 +392,11 @@ static int do_task_stat(struct task_stru
+ 	struct mm_struct *mm;
+ 	unsigned long long start_time;
+ 	unsigned long cmin_flt = 0, cmaj_flt = 0;
+ 	unsigned long  min_flt = 0,  maj_flt = 0;
+ 	cputime_t cutime, cstime, utime, stime;
++	cputime_t cgtime, gtime;
+ 	unsigned long rsslim = 0;
+ 	char tcomm[sizeof(task->comm)];
+ 	unsigned long flags;
+ 
+ 	state = *get_task_state(task);
+@@ -403,10 +411,11 @@ static int do_task_stat(struct task_stru
+ 	get_task_comm(tcomm, task);
+ 
+ 	sigemptyset(&sigign);
+ 	sigemptyset(&sigcatch);
+ 	cutime = cstime = utime = stime = cputime_zero;
++	cgtime = gtime = cputime_zero;
+ 
+ 	rcu_read_lock();
+ 	if (lock_task_sighand(task, &flags)) {
+ 		struct signal_struct *sig = task->signal;
+ 
+@@ -420,27 +429,30 @@ static int do_task_stat(struct task_stru
+ 
+ 		cmin_flt = sig->cmin_flt;
+ 		cmaj_flt = sig->cmaj_flt;
+ 		cutime = sig->cutime;
+ 		cstime = sig->cstime;
++		cgtime = sig->cgtime;
+ 		rsslim = sig->rlim[RLIMIT_RSS].rlim_cur;
+ 
+ 		/* add up live thread stats at the group level */
+ 		if (whole) {
+ 			struct task_struct *t = task;
+ 			do {
+ 				min_flt += t->min_flt;
+ 				maj_flt += t->maj_flt;
+ 				utime = cputime_add(utime, task_utime(t));
+ 				stime = cputime_add(stime, task_stime(t));
++				gtime = cputime_add(gtime, task_gtime(t));
+ 				t = next_thread(t);
+ 			} while (t != task);
+ 
+ 			min_flt += sig->min_flt;
+ 			maj_flt += sig->maj_flt;
+ 			utime = cputime_add(utime, sig->utime);
+ 			stime = cputime_add(stime, sig->stime);
++			gtime = cputime_add(gtime, sig->gtime);
+ 		}
+ 
+ 		sid = signal_session(sig);
+ 		pgid = process_group(task);
+ 		ppid = rcu_dereference(task->real_parent)->tgid;
+@@ -454,10 +466,11 @@ static int do_task_stat(struct task_stru
+ 	if (!whole) {
+ 		min_flt = task->min_flt;
+ 		maj_flt = task->maj_flt;
+ 		utime = task_utime(task);
+ 		stime = task_stime(task);
++		gtime = task_gtime(task);
+ 	}
+ 
+ 	/* scale priority and nice values from timeslices to -20..20 */
+ 	/* to make it look like a "normal" Unix priority/nice value  */
+ 	priority = task_prio(task);
+@@ -471,11 +484,11 @@ static int do_task_stat(struct task_stru
+ 	/* convert nsec -> ticks */
+ 	start_time = nsec_to_clock_t(start_time);
+ 
+ 	res = sprintf(buffer, "%d (%s) %c %d %d %d %d %d %u %lu \
+ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
+-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu\n",
++%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n",
+ 		task->pid,
+ 		tcomm,
+ 		state,
+ 		ppid,
+ 		pgid,
+@@ -516,11 +529,13 @@ static int do_task_stat(struct task_stru
+ 		0UL,
+ 		task->exit_signal,
+ 		task_cpu(task),
+ 		task->rt_priority,
+ 		task->policy,
+-		(unsigned long long)delayacct_blkio_ticks(task));
++		(unsigned long long)delayacct_blkio_ticks(task),
++		cputime_to_clock_t(gtime),
++		cputime_to_clock_t(cgtime));
+ 	if (mm)
+ 		mmput(mm);
+ 	return res;
+ }
+ 
+--- linux-2.6.23.orig/fs/proc/base.c
++++ linux-2.6.23/fs/proc/base.c
+@@ -302,11 +302,11 @@ static int proc_pid_wchan(struct task_st
+ static int proc_pid_schedstat(struct task_struct *task, char *buffer)
+ {
+ 	return sprintf(buffer, "%llu %llu %lu\n",
+ 			task->sched_info.cpu_time,
+ 			task->sched_info.run_delay,
+-			task->sched_info.pcnt);
++			task->sched_info.pcount);
+ }
+ #endif
+ 
+ /* The badness from the OOM killer */
+ unsigned long badness(struct task_struct *p, unsigned long uptime);
+--- linux-2.6.23.orig/fs/proc/proc_misc.c
++++ linux-2.6.23/fs/proc/proc_misc.c
+@@ -441,20 +441,22 @@ static const struct file_operations proc
+ static int show_stat(struct seq_file *p, void *v)
+ {
+ 	int i;
+ 	unsigned long jif;
+ 	cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
++	cputime64_t guest;
+ 	u64 sum = 0;
+ 	struct timespec boottime;
+ 	unsigned int *per_irq_sum;
+ 
+ 	per_irq_sum = kzalloc(sizeof(unsigned int)*NR_IRQS, GFP_KERNEL);
+ 	if (!per_irq_sum)
+ 		return -ENOMEM;
+ 
+ 	user = nice = system = idle = iowait =
+ 		irq = softirq = steal = cputime64_zero;
++	guest = cputime64_zero;
+ 	getboottime(&boottime);
+ 	jif = boottime.tv_sec;
+ 
+ 	for_each_possible_cpu(i) {
+ 		int j;
+@@ -465,26 +467,28 @@ static int show_stat(struct seq_file *p,
+ 		idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle);
+ 		iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait);
+ 		irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
+ 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
+ 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
++		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
+ 		for (j = 0; j < NR_IRQS; j++) {
+ 			unsigned int temp = kstat_cpu(i).irqs[j];
+ 			sum += temp;
+ 			per_irq_sum[j] += temp;
+ 		}
+ 	}
+ 
+-	seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu\n",
++	seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+ 		(unsigned long long)cputime64_to_clock_t(user),
+ 		(unsigned long long)cputime64_to_clock_t(nice),
+ 		(unsigned long long)cputime64_to_clock_t(system),
+ 		(unsigned long long)cputime64_to_clock_t(idle),
+ 		(unsigned long long)cputime64_to_clock_t(iowait),
+ 		(unsigned long long)cputime64_to_clock_t(irq),
+ 		(unsigned long long)cputime64_to_clock_t(softirq),
+-		(unsigned long long)cputime64_to_clock_t(steal));
++		(unsigned long long)cputime64_to_clock_t(steal),
++		(unsigned long long)cputime64_to_clock_t(guest));
+ 	for_each_online_cpu(i) {
+ 
+ 		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
+ 		user = kstat_cpu(i).cpustat.user;
+ 		nice = kstat_cpu(i).cpustat.nice;
+@@ -492,20 +496,23 @@ static int show_stat(struct seq_file *p,
+ 		idle = kstat_cpu(i).cpustat.idle;
+ 		iowait = kstat_cpu(i).cpustat.iowait;
+ 		irq = kstat_cpu(i).cpustat.irq;
+ 		softirq = kstat_cpu(i).cpustat.softirq;
+ 		steal = kstat_cpu(i).cpustat.steal;
+-		seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu\n",
++		guest = kstat_cpu(i).cpustat.guest;
++		seq_printf(p,
++			"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+ 			i,
+ 			(unsigned long long)cputime64_to_clock_t(user),
+ 			(unsigned long long)cputime64_to_clock_t(nice),
+ 			(unsigned long long)cputime64_to_clock_t(system),
+ 			(unsigned long long)cputime64_to_clock_t(idle),
+ 			(unsigned long long)cputime64_to_clock_t(iowait),
+ 			(unsigned long long)cputime64_to_clock_t(irq),
+ 			(unsigned long long)cputime64_to_clock_t(softirq),
+-			(unsigned long long)cputime64_to_clock_t(steal));
++			(unsigned long long)cputime64_to_clock_t(steal),
++			(unsigned long long)cputime64_to_clock_t(guest));
+ 	}
+ 	seq_printf(p, "intr %llu", (unsigned long long)sum);
+ 
+ #ifndef CONFIG_SMP
+ 	/* Touches too many cache lines on SMP setups */
+--- /dev/null
++++ linux-2.6.23/include/linux/cgroup.h
+@@ -0,0 +1,12 @@
++#ifndef _LINUX_CGROUP_H
++#define _LINUX_CGROUP_H
++
++/*
++ * Control groups are not backported - we use a few compatibility
++ * defines to be able to use the upstream sched.c as-is:
++ */
++#define task_pid_nr(task)		(task)->pid
++#define task_pid_vnr(task)		(task)->pid
++#define find_task_by_vpid(pid)		find_task_by_pid(pid)
++
++#endif
+--- linux-2.6.23.orig/include/linux/cpuset.h
++++ linux-2.6.23/include/linux/cpuset.h
+@@ -144,8 +144,13 @@ static inline int cpuset_do_slab_mem_spr
+ 	return 0;
+ }
+ 
+ static inline void cpuset_track_online_nodes(void) {}
+ 
++static inline cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p)
++{
++	return cpu_possible_map;
++}
++
+ #endif /* !CONFIG_CPUSETS */
+ 
+ #endif /* _LINUX_CPUSET_H */
+--- linux-2.6.23.orig/include/linux/kernel.h
++++ linux-2.6.23/include/linux/kernel.h
+@@ -59,10 +59,17 @@ extern const char linux_proc_banner[];
+ #define	KERN_WARNING	"<4>"	/* warning conditions			*/
+ #define	KERN_NOTICE	"<5>"	/* normal but significant condition	*/
+ #define	KERN_INFO	"<6>"	/* informational			*/
+ #define	KERN_DEBUG	"<7>"	/* debug-level messages			*/
+ 
++/*
++ * Annotation for a "continued" line of log printout (only done after a
++ * line that had no enclosing \n). Only to be used by core/arch code
++ * during early bootup (a continued line is not SMP-safe otherwise).
++ */
++#define	KERN_CONT	""
++
+ extern int console_printk[];
+ 
+ #define console_loglevel (console_printk[0])
+ #define default_message_loglevel (console_printk[1])
+ #define minimum_console_loglevel (console_printk[2])
+--- linux-2.6.23.orig/include/linux/kernel_stat.h
++++ linux-2.6.23/include/linux/kernel_stat.h
+@@ -21,10 +21,11 @@ struct cpu_usage_stat {
+ 	cputime64_t softirq;
+ 	cputime64_t irq;
+ 	cputime64_t idle;
+ 	cputime64_t iowait;
+ 	cputime64_t steal;
++	cputime64_t guest;
+ };
+ 
+ struct kernel_stat {
+ 	struct cpu_usage_stat	cpustat;
+ 	unsigned int irqs[NR_IRQS];
+@@ -50,9 +51,11 @@ static inline int kstat_irqs(int irq)
+ 
+ 	return sum;
+ }
+ 
+ extern void account_user_time(struct task_struct *, cputime_t);
++extern void account_user_time_scaled(struct task_struct *, cputime_t);
+ extern void account_system_time(struct task_struct *, int, cputime_t);
++extern void account_system_time_scaled(struct task_struct *, cputime_t);
+ extern void account_steal_time(struct task_struct *, cputime_t);
+ 
+ #endif /* _LINUX_KERNEL_STAT_H */
+--- linux-2.6.23.orig/include/linux/nodemask.h
++++ linux-2.6.23/include/linux/nodemask.h
+@@ -336,46 +336,108 @@ static inline void __nodes_remap(nodemas
+ 	if (!nodes_empty(mask))				\
+ 		for ((node) = 0; (node) < 1; (node)++)
+ #endif /* MAX_NUMNODES */
+ 
+ /*
++ * Bitmasks that are kept for all the nodes.
++ */
++enum node_states {
++	N_POSSIBLE,		/* The node could become online at some point */
++	N_ONLINE,		/* The node is online */
++	N_NORMAL_MEMORY,	/* The node has regular memory */
++#ifdef CONFIG_HIGHMEM
++	N_HIGH_MEMORY,		/* The node has regular or high memory */
++#else
++	N_HIGH_MEMORY = N_NORMAL_MEMORY,
++#endif
++	N_CPU,		/* The node has one or more cpus */
++	NR_NODE_STATES
++};
++
++/*
+  * The following particular system nodemasks and operations
+  * on them manage all possible and online nodes.
+  */
+ 
+-extern nodemask_t node_online_map;
+-extern nodemask_t node_possible_map;
++extern nodemask_t node_states[NR_NODE_STATES];
+ 
+ #if MAX_NUMNODES > 1
+-#define num_online_nodes()	nodes_weight(node_online_map)
+-#define num_possible_nodes()	nodes_weight(node_possible_map)
+-#define node_online(node)	node_isset((node), node_online_map)
+-#define node_possible(node)	node_isset((node), node_possible_map)
+-#define first_online_node	first_node(node_online_map)
+-#define next_online_node(nid)	next_node((nid), node_online_map)
++static inline int node_state(int node, enum node_states state)
++{
++	return node_isset(node, node_states[state]);
++}
++
++static inline void node_set_state(int node, enum node_states state)
++{
++	__node_set(node, &node_states[state]);
++}
++
++static inline void node_clear_state(int node, enum node_states state)
++{
++	__node_clear(node, &node_states[state]);
++}
++
++static inline int num_node_state(enum node_states state)
++{
++	return nodes_weight(node_states[state]);
++}
++
++#define for_each_node_state(__node, __state) \
++	for_each_node_mask((__node), node_states[__state])
++
++#define first_online_node	first_node(node_states[N_ONLINE])
++#define next_online_node(nid)	next_node((nid), node_states[N_ONLINE])
++
+ extern int nr_node_ids;
+ #else
+-#define num_online_nodes()	1
+-#define num_possible_nodes()	1
+-#define node_online(node)	((node) == 0)
+-#define node_possible(node)	((node) == 0)
++
++static inline int node_state(int node, enum node_states state)
++{
++	return node == 0;
++}
++
++static inline void node_set_state(int node, enum node_states state)
++{
++}
++
++static inline void node_clear_state(int node, enum node_states state)
++{
++}
++
++static inline int num_node_state(enum node_states state)
++{
++	return 1;
++}
++
++#define for_each_node_state(node, __state) \
++	for ( (node) = 0; (node) == 0; (node) = 1)
++
+ #define first_online_node	0
+ #define next_online_node(nid)	(MAX_NUMNODES)
+ #define nr_node_ids		1
++
+ #endif
+ 
++#define node_online_map 	node_states[N_ONLINE]
++#define node_possible_map 	node_states[N_POSSIBLE]
++
+ #define any_online_node(mask)			\
+ ({						\
+ 	int node;				\
+ 	for_each_node_mask(node, (mask))	\
+ 		if (node_online(node))		\
+ 			break;			\
+ 	node;					\
+ })
+ 
+-#define node_set_online(node)	   set_bit((node), node_online_map.bits)
+-#define node_set_offline(node)	   clear_bit((node), node_online_map.bits)
++#define num_online_nodes()	num_node_state(N_ONLINE)
++#define num_possible_nodes()	num_node_state(N_POSSIBLE)
++#define node_online(node)	node_state((node), N_ONLINE)
++#define node_possible(node)	node_state((node), N_POSSIBLE)
++
++#define node_set_online(node)	   node_set_state((node), N_ONLINE)
++#define node_set_offline(node)	   node_clear_state((node), N_ONLINE)
+ 
+-#define for_each_node(node)	   for_each_node_mask((node), node_possible_map)
+-#define for_each_online_node(node) for_each_node_mask((node), node_online_map)
++#define for_each_node(node)	   for_each_node_state(node, N_POSSIBLE)
++#define for_each_online_node(node) for_each_node_state(node, N_ONLINE)
+ 
+ #endif /* __LINUX_NODEMASK_H */
+--- linux-2.6.23.orig/include/linux/sched.h
++++ linux-2.6.23/include/linux/sched.h
+@@ -1,10 +1,21 @@
+ #ifndef _LINUX_SCHED_H
+ #define _LINUX_SCHED_H
+ 
+ #include <linux/auxvec.h>	/* For AT_VECTOR_SIZE */
+ 
++/* backporting helper macro: */
++#define cpu_sibling_map(cpu) cpu_sibling_map[cpu]
++
++/*
++ * Control groups are not backported - we use a few compatibility
++ * defines to be able to use the upstream sched.c as-is:
++ */
++#define task_pid_nr(task)               (task)->pid
++#define task_pid_vnr(task)              (task)->pid
++#define find_task_by_vpid(pid)          find_task_by_pid(pid)
++
+ /*
+  * cloning flags:
+  */
+ #define CSIGNAL		0x000000ff	/* signal mask to be sent at exit */
+ #define CLONE_VM	0x00000100	/* set if VM shared between processes */
+@@ -84,10 +95,11 @@ struct sched_param {
+ #include <linux/param.h>
+ #include <linux/resource.h>
+ #include <linux/timer.h>
+ #include <linux/hrtimer.h>
+ #include <linux/task_io_accounting.h>
++#include <linux/kobject.h>
+ 
+ #include <asm/processor.h>
+ 
+ struct exec_domain;
+ struct futex_pi_state;
+@@ -133,10 +145,11 @@ extern unsigned long nr_active(void);
+ extern unsigned long nr_iowait(void);
+ extern unsigned long weighted_cpuload(const int cpu);
+ 
+ struct seq_file;
+ struct cfs_rq;
++struct task_group;
+ #ifdef CONFIG_SCHED_DEBUG
+ extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
+ extern void proc_sched_set_task(struct task_struct *p);
+ extern void
+ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+@@ -171,12 +184,11 @@ print_cfs_rq(struct seq_file *m, int cpu
+ #define TASK_TRACED		8
+ /* in tsk->exit_state */
+ #define EXIT_ZOMBIE		16
+ #define EXIT_DEAD		32
+ /* in tsk->state again */
+-#define TASK_NONINTERACTIVE	64
+-#define TASK_DEAD		128
++#define TASK_DEAD		64
+ 
+ #define __set_task_state(tsk, state_value)		\
+ 	do { (tsk)->state = (state_value); } while (0)
+ #define set_task_state(tsk, state_value)		\
+ 	set_mb((tsk)->state, (state_value))
+@@ -276,10 +288,14 @@ static inline void touch_all_softlockup_
+ #endif
+ 
+ 
+ /* Attach to any functions which should be ignored in wchan output. */
+ #define __sched		__attribute__((__section__(".sched.text")))
++
++/* Linker adds these: start and end of __sched functions */
++extern char __sched_text_start[], __sched_text_end[];
++
+ /* Is this address in the __sched functions? */
+ extern int in_sched_functions(unsigned long addr);
+ 
+ #define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
+ extern signed long FASTCALL(schedule_timeout(signed long timeout));
+@@ -513,10 +529,12 @@ struct signal_struct {
+ 	 * and for reaped dead child processes forked by this group.
+ 	 * Live threads maintain their own counters and add to these
+ 	 * in __exit_signal, except for the group leader.
+ 	 */
+ 	cputime_t utime, stime, cutime, cstime;
++	cputime_t gtime;
++	cputime_t cgtime;
+ 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
+ 	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
+ 	unsigned long inblock, oublock, cinblock, coublock;
+ 
+ 	/*
+@@ -593,12 +611,27 @@ struct user_struct {
+ #endif
+ 
+ 	/* Hash table maintenance information */
+ 	struct hlist_node uidhash_node;
+ 	uid_t uid;
++
++#ifdef CONFIG_FAIR_USER_SCHED
++	struct task_group *tg;
++#ifdef CONFIG_SYSFS
++	struct kset kset;
++	struct subsys_attribute user_attr;
++	struct work_struct work;
++#endif
++#endif
+ };
+ 
++#ifdef CONFIG_FAIR_USER_SCHED
++extern int uids_kobject_init(void);
++#else
++static inline int uids_kobject_init(void) { return 0; }
++#endif
++
+ extern struct user_struct *find_user(uid_t);
+ 
+ extern struct user_struct root_user;
+ #define INIT_USER (&root_user)
+ 
+@@ -606,17 +639,21 @@ struct backing_dev_info;
+ struct reclaim_state;
+ 
+ #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+ struct sched_info {
+ 	/* cumulative counters */
+-	unsigned long pcnt;	      /* # of times run on this cpu */
++	unsigned long pcount;	      /* # of times run on this cpu */
+ 	unsigned long long cpu_time,  /* time spent on the cpu */
+ 			   run_delay; /* time spent waiting on a runqueue */
+ 
+ 	/* timestamps */
+ 	unsigned long long last_arrival,/* when we last ran on a cpu */
+ 			   last_queued;	/* when we were last queued to run */
++#ifdef CONFIG_SCHEDSTATS
++	/* BKL stats */
++	unsigned int bkl_count;
++#endif
+ };
+ #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
+ 
+ #ifdef CONFIG_SCHEDSTATS
+ extern const struct file_operations proc_schedstat_operations;
+@@ -747,43 +784,42 @@ struct sched_domain {
+ 	unsigned int balance_interval;	/* initialise to 1. units in ms. */
+ 	unsigned int nr_balance_failed; /* initialise to 0 */
+ 
+ #ifdef CONFIG_SCHEDSTATS
+ 	/* load_balance() stats */
+-	unsigned long lb_cnt[CPU_MAX_IDLE_TYPES];
+-	unsigned long lb_failed[CPU_MAX_IDLE_TYPES];
+-	unsigned long lb_balanced[CPU_MAX_IDLE_TYPES];
+-	unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES];
+-	unsigned long lb_gained[CPU_MAX_IDLE_TYPES];
+-	unsigned long lb_hot_gained[CPU_MAX_IDLE_TYPES];
+-	unsigned long lb_nobusyg[CPU_MAX_IDLE_TYPES];
+-	unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES];
++	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
++	unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
++	unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
++	unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
++	unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
++	unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
++	unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
++	unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];
+ 
+ 	/* Active load balancing */
+-	unsigned long alb_cnt;
+-	unsigned long alb_failed;
+-	unsigned long alb_pushed;
++	unsigned int alb_count;
++	unsigned int alb_failed;
++	unsigned int alb_pushed;
+ 
+ 	/* SD_BALANCE_EXEC stats */
+-	unsigned long sbe_cnt;
+-	unsigned long sbe_balanced;
+-	unsigned long sbe_pushed;
++	unsigned int sbe_count;
++	unsigned int sbe_balanced;
++	unsigned int sbe_pushed;
+ 
+ 	/* SD_BALANCE_FORK stats */
+-	unsigned long sbf_cnt;
+-	unsigned long sbf_balanced;
+-	unsigned long sbf_pushed;
++	unsigned int sbf_count;
++	unsigned int sbf_balanced;
++	unsigned int sbf_pushed;
+ 
+ 	/* try_to_wake_up() stats */
+-	unsigned long ttwu_wake_remote;
+-	unsigned long ttwu_move_affine;
+-	unsigned long ttwu_move_balance;
++	unsigned int ttwu_wake_remote;
++	unsigned int ttwu_move_affine;
++	unsigned int ttwu_move_balance;
+ #endif
+ };
+ 
+-extern int partition_sched_domains(cpumask_t *partition1,
+-				    cpumask_t *partition2);
++extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new);
+ 
+ #endif	/* CONFIG_SMP */
+ 
+ /*
+  * A runqueue laden with a single nice 0 task scores a weighted_cpuload of
+@@ -851,27 +887,32 @@ struct uts_namespace;
+ 
+ struct rq;
+ struct sched_domain;
+ 
+ struct sched_class {
+-	struct sched_class *next;
++	const struct sched_class *next;
+ 
+ 	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
+ 	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
+-	void (*yield_task) (struct rq *rq, struct task_struct *p);
++	void (*yield_task) (struct rq *rq);
+ 
+ 	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
+ 
+ 	struct task_struct * (*pick_next_task) (struct rq *rq);
+ 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
+ 
++#ifdef CONFIG_SMP
+ 	unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
+-			struct rq *busiest,
+-			unsigned long max_nr_move, unsigned long max_load_move,
++			struct rq *busiest, unsigned long max_load_move,
+ 			struct sched_domain *sd, enum cpu_idle_type idle,
+ 			int *all_pinned, int *this_best_prio);
+ 
++	int (*move_one_task) (struct rq *this_rq, int this_cpu,
++			      struct rq *busiest, struct sched_domain *sd,
++			      enum cpu_idle_type idle);
++#endif
++
+ 	void (*set_curr_task) (struct rq *rq);
+ 	void (*task_tick) (struct rq *rq, struct task_struct *p);
+ 	void (*task_new) (struct rq *rq, struct task_struct *p);
+ };
+ 
+@@ -885,46 +926,52 @@ struct load_weight {
+  * Current field usage histogram:
+  *
+  *     4 se->block_start
+  *     4 se->run_node
+  *     4 se->sleep_start
+- *     4 se->sleep_start_fair
+  *     6 se->load.weight
+- *     7 se->delta_fair
+- *    15 se->wait_runtime
+  */
+ struct sched_entity {
+-	long			wait_runtime;
+-	unsigned long		delta_fair_run;
+-	unsigned long		delta_fair_sleep;
+-	unsigned long		delta_exec;
+-	s64			fair_key;
+ 	struct load_weight	load;		/* for load-balancing */
+ 	struct rb_node		run_node;
+ 	unsigned int		on_rq;
+ 
+ 	u64			exec_start;
+ 	u64			sum_exec_runtime;
++	u64			vruntime;
+ 	u64			prev_sum_exec_runtime;
+-	u64			wait_start_fair;
+-	u64			sleep_start_fair;
+ 
+ #ifdef CONFIG_SCHEDSTATS
+ 	u64			wait_start;
+ 	u64			wait_max;
+-	s64			sum_wait_runtime;
+ 
+ 	u64			sleep_start;
+ 	u64			sleep_max;
+ 	s64			sum_sleep_runtime;
+ 
+ 	u64			block_start;
+ 	u64			block_max;
+ 	u64			exec_max;
++	u64			slice_max;
+ 
+-	unsigned long		wait_runtime_overruns;
+-	unsigned long		wait_runtime_underruns;
++	u64			nr_migrations;
++	u64			nr_migrations_cold;
++	u64			nr_failed_migrations_affine;
++	u64			nr_failed_migrations_running;
++	u64			nr_failed_migrations_hot;
++	u64			nr_forced_migrations;
++	u64			nr_forced2_migrations;
++
++	u64			nr_wakeups;
++	u64			nr_wakeups_sync;
++	u64			nr_wakeups_migrate;
++	u64			nr_wakeups_local;
++	u64			nr_wakeups_remote;
++	u64			nr_wakeups_affine;
++	u64			nr_wakeups_affine_attempts;
++	u64			nr_wakeups_passive;
++	u64			nr_wakeups_idle;
+ #endif
+ 
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+ 	struct sched_entity	*parent;
+ 	/* rq on which this entity is (to be) queued: */
+@@ -949,11 +996,11 @@ struct task_struct {
+ #endif
+ #endif
+ 
+ 	int prio, static_prio, normal_prio;
+ 	struct list_head run_list;
+-	struct sched_class *sched_class;
++	const struct sched_class *sched_class;
+ 	struct sched_entity se;
+ 
+ #ifdef CONFIG_PREEMPT_NOTIFIERS
+ 	/* list of struct preempt_notifier: */
+ 	struct hlist_head preempt_notifiers;
+@@ -1019,11 +1066,12 @@ struct task_struct {
+ 	struct completion *vfork_done;		/* for vfork() */
+ 	int __user *set_child_tid;		/* CLONE_CHILD_SETTID */
+ 	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
+ 
+ 	unsigned int rt_priority;
+-	cputime_t utime, stime;
++	cputime_t utime, stime, utimescaled, stimescaled;
++	cputime_t gtime;
+ 	cputime_t prev_utime, prev_stime;
+ 	unsigned long nvcsw, nivcsw; /* context switch counts */
+ 	struct timespec start_time; 		/* monotonic time */
+ 	struct timespec real_start_time;	/* boot based time */
+ /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
+@@ -1312,10 +1360,11 @@ static inline void put_task_struct(struc
+ #define PF_ALIGNWARN	0x00000001	/* Print alignment warning msgs */
+ 					/* Not implemented yet, only for 486*/
+ #define PF_STARTING	0x00000002	/* being created */
+ #define PF_EXITING	0x00000004	/* getting shut down */
+ #define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
++#define PF_VCPU		0x00000010	/* I'm a virtual CPU */
+ #define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
+ #define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
+ #define PF_DUMPCORE	0x00000200	/* dumped core */
+ #define PF_SIGNALED	0x00000400	/* killed by a signal */
+ #define PF_MEMALLOC	0x00000800	/* Allocating memory */
+@@ -1399,19 +1448,30 @@ extern void idle_task_exit(void);
+ static inline void idle_task_exit(void) {}
+ #endif
+ 
+ extern void sched_idle_next(void);
+ 
++#ifdef CONFIG_SCHED_DEBUG
+ extern unsigned int sysctl_sched_latency;
+ extern unsigned int sysctl_sched_min_granularity;
+ extern unsigned int sysctl_sched_wakeup_granularity;
+ extern unsigned int sysctl_sched_batch_wakeup_granularity;
+-extern unsigned int sysctl_sched_stat_granularity;
+-extern unsigned int sysctl_sched_runtime_limit;
+-extern unsigned int sysctl_sched_compat_yield;
+ extern unsigned int sysctl_sched_child_runs_first;
+ extern unsigned int sysctl_sched_features;
++extern unsigned int sysctl_sched_migration_cost;
++extern unsigned int sysctl_sched_nr_migrate;
++#ifdef CONFIG_FAIR_GROUP_SCHED
++extern unsigned int sysctl_sched_min_bal_int_shares;
++extern unsigned int sysctl_sched_max_bal_int_shares;
++#endif
++
++int sched_nr_latency_handler(struct ctl_table *table, int write,
++		struct file *file, void __user *buffer, size_t *length,
++		loff_t *ppos);
++#endif
++
++extern unsigned int sysctl_sched_compat_yield;
+ 
+ #ifdef CONFIG_RT_MUTEXES
+ extern int rt_mutex_getprio(struct task_struct *p);
+ extern void rt_mutex_setprio(struct task_struct *p, int prio);
+ extern void rt_mutex_adjust_pi(struct task_struct *p);
+@@ -1841,10 +1901,22 @@ extern long sched_getaffinity(pid_t pid,
+ 
+ extern int sched_mc_power_savings, sched_smt_power_savings;
+ 
+ extern void normalize_rt_tasks(void);
+ 
++#ifdef CONFIG_FAIR_GROUP_SCHED
++
++extern struct task_group init_task_group;
author	Marcin Juszkiewicz <marcin@juszkiewicz.com.pl>	2009-10-21 12:22:20 +0200
committer	Marcin Juszkiewicz <marcin@juszkiewicz.com.pl>	2009-12-14 11:34:58 +0100
commit	9d1b79b7848e13e1bf80b736671f76144cc508d4 (patch)
tree	c5e1ff0f4b3c2f08444666fdf8348aa8610fc98d
parent	575cf43aa9df4192aa9125258545e7943a45f4d5 (diff)