diff options
Diffstat (limited to 'recipes/gcc/gcc-4.4.1/fedora/gcc43-libgomp-speedup.patch')
-rw-r--r-- | recipes/gcc/gcc-4.4.1/fedora/gcc43-libgomp-speedup.patch | 2797 |
1 files changed, 2797 insertions, 0 deletions
diff --git a/recipes/gcc/gcc-4.4.1/fedora/gcc43-libgomp-speedup.patch b/recipes/gcc/gcc-4.4.1/fedora/gcc43-libgomp-speedup.patch new file mode 100644 index 0000000000..da85e556ec --- /dev/null +++ b/recipes/gcc/gcc-4.4.1/fedora/gcc43-libgomp-speedup.patch @@ -0,0 +1,2797 @@ +2008-03-28 Jakub Jelinek <jakub@redhat.com> + + * config/linux/sparc/futex.h (atomic_write_barrier): Fix membar + argument. + +2008-03-27 Jakub Jelinek <jakub@redhat.com> + + * libgomp.h (struct gomp_team_state): Remove single_count field + ifndef HAVE_SYNC_BUILTINS. + (struct gomp_team): Likewise. Add work_share_list_free_lock + ifndef HAVE_SYNC_BUILTINS. + * team.c (gomp_new_team): If HAVE_SYNC_BUILTINS is not defined, + don't initialize single_count, but instead initialize + work_share_list_free_lock. + (free_team): Destroy work_share_list_free_lock ifndef + HAVE_SYNC_BUILTINS. + (gomp_team_start): Don't initialize ts.single_count ifndef + HAVE_SYNC_BUILTINS. + * work.c (alloc_work_share, free_work_share): Use + work_share_list_free_lock instead of atomic chaining ifndef + HAVE_SYNC_BUILTINS. + +2008-03-26 Jakub Jelinek <jakub@redhat.com> + + * loop.c (gomp_loop_init): Fix GFS_DYNAMIC ws->mode setting. + * testsuite/libgomp.c/loop-4.c: New test. + + * libgomp.h (struct gomp_team_state): Add single_count field. + (struct gomp_team): Likewise. + * team.c (gomp_new_team): Clear single_count. + (gomp_team_start): Likewise. + * single.c (GOMP_single_start): Rewritten if HAVE_SYNC_BUILTINS. + +2008-03-25 Jakub Jelinek <jakub@redhat.com> + + * team.c (gomp_thread_start): Don't clear ts.static_trip here. + * loop.c (gomp_loop_static_start, gomp_loop_dynamic_start): Clear + ts.static_trip here. + * work.c (gomp_work_share_start): Don't clear ts.static_trip here. + +2008-03-21 Jakub Jelinek <jakub@redhat.com> + + * libgomp.h: Include ptrlock.h. + (struct gomp_work_share): Reshuffle fields. Add next_alloc, + next_ws, next_free and inline_ordered_team_ids fields, change + ordered_team_ids into pointer from flexible array member. + (struct gomp_team_state): Add last_work_share field, remove + work_share_generation. + (struct gomp_team): Remove work_share_lock, generation_mask, + oldest_live_gen, num_live_gen and init_work_shares fields, add + work work_share_list_alloc, work_share_list_free and work_share_chunk + fields. Change work_shares from pointer to pointers into an array. + (gomp_new_team): New prototype. + (gomp_team_start): Change type of last argument. + (gomp_new_work_share): Removed. + (gomp_init_work_share, gomp_fini_work_share): New prototypes. + (gomp_work_share_init_done): New static inline. + * team.c (gomp_thread_start): Clear ts.last_work_share, don't clear + ts.work_share_generation. + (new_team): Removed. + (gomp_new_team): New function. + (free_team): Free gomp_work_share blocks chained through next_alloc, + instead of freeing work_shares and destroying work_share_lock. + (gomp_team_start): Change last argument from ws to team, don't create + new team, set ts.work_share to &team->work_shares[0] and clear + ts.last_work_share. Don't clear ts.work_share_generation. + (gomp_team_end): Call gomp_fini_work_share. + * work.c (gomp_new_work_share): Removed. + (alloc_work_share, gomp_init_work_share, gomp_fini_work_share): New + functions. + (free_work_share): Add team argument. Call gomp_fini_work_share + and then either free ws if orphaned, or put it into + work_share_list_free list of the current team. + (gomp_work_share_start, gomp_work_share_end, + gomp_work_share_end_nowait): Rewritten. + * sections.c (GOMP_sections_start): Call gomp_work_share_init_done + after gomp_sections_init. If HAVE_SYNC_BUILTINS, call + gomp_iter_dynamic_next instead of the _locked variant and don't take + lock around it, otherwise acquire it before calling + gomp_iter_dynamic_next_locked. + (GOMP_sections_next): If HAVE_SYNC_BUILTINS, call + gomp_iter_dynamic_next instead of the _locked variant and don't take + lock around it. + (GOMP_parallel_sections_start): Call gomp_new_team instead of + gomp_new_work_share. Call gomp_sections_init on &team->work_shares[0]. + Adjust gomp_team_start caller. + * loop.c (gomp_loop_static_start, gomp_loop_ordered_static_start): Call + gomp_work_share_init_done after gomp_loop_init. Don't unlock ws->lock. + (gomp_loop_dynamic_start, gomp_loop_guided_start): Call + gomp_work_share_init_done after gomp_loop_init. If HAVE_SYNC_BUILTINS, + don't unlock ws->lock, otherwise lock it. + (gomp_loop_ordered_dynamic_start, gomp_loop_ordered_guided_start): Call + gomp_work_share_init_done after gomp_loop_init. Lock ws->lock. + (gomp_parallel_loop_start): Call gomp_new_team instead of + gomp_new_work_share. Call gomp_loop_init on &team->work_shares[0]. + Adjust gomp_team_start caller. + * single.c (GOMP_single_start, GOMP_single_copy_start): Call + gomp_work_share_init_done if gomp_work_share_start returned true. + Don't unlock ws->lock. + * parallel.c (GOMP_parallel_start): Call gomp_new_team and pass that + as last argument to gomp_team_start. + * config/linux/ptrlock.c: New file. + * config/linux/ptrlock.h: New file. + * config/posix/ptrlock.c: New file. + * config/posix/ptrlock.h: New file. + * Makefile.am (libgomp_la_SOURCES): Add ptrlock.c. + * Makefile.in: Regenerated. + * testsuite/Makefile.in: Regenerated. + +2008-03-19 Jakub Jelinek <jakub@redhat.com> + + * libgomp.h (gomp_active_wait_policy): Remove decl. + (gomp_throttled_spin_count_var, gomp_available_cpus, + gomp_managed_threads): New extern decls. + * team.c (gomp_team_start, gomp_team_end): If number of threads + changed, adjust atomically gomp_managed_threads. + * env.c (gomp_active_wait_policy, gomp_block_time_var): Remove. + (gomp_throttled_spin_count_var, gomp_available_cpus, + gomp_managed_threads): New variables. + (parse_millis): Removed. + (parse_spincount): New function. + (parse_wait_policy): Return -1/0/1 instead of setting + gomp_active_wait_policy. + (initialize_env): Call gomp_init_num_threads unconditionally. + Initialize gomp_available_cpus. Call parse_spincount instead + of parse_millis, initialize gomp_{,throttled_}spin_count_var + depending on presence and value of OMP_WAIT_POLICY and + GOMP_SPINCOUNT env vars. + * config/linux/wait.h (do_wait): Use gomp_throttled_spin_count_var + instead of gomp_spin_count_var if gomp_managed_threads > + gomp_available_cpus. + + * config/linux/wait.h: Include errno.h. + (FUTEX_WAIT, FUTEX_WAKE, FUTEX_PRIVATE_FLAG): Define. + (gomp_futex_wake, gomp_futex_wait): New extern decls. + * config/linux/mutex.c (gomp_futex_wake, gomp_futex_wait): New + variables. + * config/linux/powerpc/futex.h (FUTEX_WAIT, FUTEX_WAKE): Remove. + (sys_futex0): Return error code. + (futex_wake, futex_wait): If ENOSYS was returned, clear + FUTEX_PRIVATE_FLAG in gomp_futex_wa{ke,it} and retry. + * config/linux/alpha/futex.h (FUTEX_WAIT, FUTEX_WAKE): Remove. + (futex_wake, futex_wait): If ENOSYS was returned, clear + FUTEX_PRIVATE_FLAG in gomp_futex_wa{ke,it} and retry. + * config/linux/x86/futex.h (FUTEX_WAIT, FUTEX_WAKE): Remove. + (sys_futex0): Return error code. + (futex_wake, futex_wait): If ENOSYS was returned, clear + FUTEX_PRIVATE_FLAG in gomp_futex_wa{ke,it} and retry. + * config/linux/s390/futex.h (FUTEX_WAIT, FUTEX_WAKE): Remove. + (sys_futex0): Return error code. + (futex_wake, futex_wait): If ENOSYS was returned, clear + FUTEX_PRIVATE_FLAG in gomp_futex_wa{ke,it} and retry. + * config/linux/ia64/futex.h (FUTEX_WAIT, FUTEX_WAKE): Remove. + (sys_futex0): Return error code. + (futex_wake, futex_wait): If ENOSYS was returned, clear + FUTEX_PRIVATE_FLAG in gomp_futex_wa{ke,it} and retry. + * config/linux/sparc/futex.h (FUTEX_WAIT, FUTEX_WAKE): Remove. + (sys_futex0): Return error code. + (futex_wake, futex_wait): If ENOSYS was returned, clear + FUTEX_PRIVATE_FLAG in gomp_futex_wa{ke,it} and retry. + +2008-03-18 Jakub Jelinek <jakub@redhat.com> + + * libgomp.h (struct gomp_work_share): Add mode field. Put lock and + next into a different cache line from most of the write-once fields. + * loop.c: Include limits.h. + (gomp_loop_init): For GFS_DYNAMIC, multiply ws->chunk_size by incr. + If adding ws->chunk_size nthreads + 1 times after end won't + overflow, set ws->mode to 1. + * iter.c (gomp_iter_dynamic_next_locked): Don't multiply + ws->chunk_size by incr. + (gomp_iter_dynamic_next): Likewise. If ws->mode, use more efficient + code. + * work.c: Include stddef.h. + (gomp_new_work_share): Use offsetof rather than sizeof. + +2008-03-17 Jakub Jelinek <jakub@redhat.com> + + * libgomp.h (struct gomp_team): Change ordered_release field + into gomp_sem_t ** from flexible array member. Add implicit_task + and initial_work_shares fields. + (gomp_new_task): Removed. + (gomp_init_task): New prototype. + * team.c (new_team): Allocate implicit_task for each thread + and initial work_shares together with gomp_team allocation. + (free_team): Only free work_shares if it is not init_work_shares. + (gomp_team_start): Use gomp_init_task instead of gomp_new_task, + set thr->task to the corresponding implicit_task array entry. + * task.c (gomp_new_task): Removed. + (gomp_init_task): New function. + (gomp_end_task): Don't free the task. + (GOMP_task): Allocate struct gomp_task on the stack, call + gomp_init_task rather than gomp_new_task. + * work.c (gomp_work_share_start): If work_shares == + init_work_shares, gomp_malloc + memcpy rather than gomp_realloc. + +2008-03-15 Jakub Jelinek <jakub@redhat.com> + Ulrich Drepper <drepper@redhat.com> + + * config/linux/bar.h (gomp_barrier_state_t): Rewritten. + (gomp_barrier_state_t): Change to unsigned int. + (gomp_barrier_init, gomp_barrier_reinit, gomp_barrier_destroy, + gomp_barrier_wait_start, gomp_barrier_last_thread): Rewritten. + (gomp_barrier_wait_last): Prototype rather than inline. + * config/linux/bar.c (gomp_barrier_wait_end): Rewritten. + (gomp_barrier_wait_last): New function. + +2008-03-15 Jakub Jelinek <jakub@redhat.com> + + * team.c (gomp_thread_start): Use gomp_barrier_wait_last instead + of gomp_barrier_wait. + * env.c (gomp_block_time_var, gomp_spin_count_var): New variables. + (parse_millis): New function. + (initialize_env): Handle GOMP_BLOCKTIME env var. + * libgomp.h (struct gomp_team): Move close to the end of the struct. + (gomp_spin_count_var): New extern var decl. + * work.c (gomp_work_share_end): Use gomp_barrier_state_t bstate + var instead of bool last, call gomp_barrier_last_thread to check + for last thread, pass bstate to gomp_barrier_wait_end. + * config/linux/wait.h: New file. + * config/linux/mutex.c: Include wait.h instead of libgomp.h and + futex.h. + (gomp_mutex_lock_slow): Call do_wait instead of futex_wait. + * config/linux/bar.c: Include wait.h instead of libgomp.h and + futex.h. + (gomp_barrier_wait_end): Change second argument to + gomp_barrier_state_t. Call do_wait instead of futex_wait. + * config/linux/sem.c: Include wait.h instead of libgomp.h and + futex.h. + (gomp_sem_wait_slow): Call do_wait instead of futex_wait. + * config/linux/lock.c: Include wait.h instead of libgomp.h and + futex.h. + (gomp_set_nest_lock_25): Call do_wait instead of futex_wait. + * config/linux/affinity.c: Assume HAVE_SYNC_BUILTINS. + * config/linux/bar.h (gomp_barrier_state_t): New typedef. + (gomp_barrier_wait_end): Change second argument to + gomp_barrier_state_t. + (gomp_barrier_wait_start): Return gomp_barrier_state_t. + (gomp_barrier_last_thread, gomp_barrier_wait_last): New static + inlines. + * config/linux/powerpc/futex.h (cpu_relax, atomic_write_barrier): New + static inlines. + * config/linux/alpha/futex.h (cpu_relax, atomic_write_barrier): + Likewise. + * config/linux/x86/futex.h (cpu_relax, atomic_write_barrier): + Likewise. + * config/linux/s390/futex.h (cpu_relax, atomic_write_barrier): + Likewise. + * config/linux/ia64/futex.h (cpu_relax, atomic_write_barrier): + Likewise. + * config/linux/sparc/futex.h (cpu_relax, atomic_write_barrier): + Likewise. + * config/posix/bar.c (gomp_barrier_wait_end): Change second argument + to gomp_barrier_state_t. + * config/posix/bar.h (gomp_barrier_state_t): New typedef. + (gomp_barrier_wait_end): Change second argument to + gomp_barrier_state_t. + (gomp_barrier_wait_start): Return gomp_barrier_state_t. + (gomp_barrier_last_thread, gomp_barrier_wait_last): New static + inlines. + +--- libgomp/parallel.c.jj 2007-12-07 14:41:01.000000000 +0100 ++++ libgomp/parallel.c 2008-03-26 15:32:06.000000000 +0100 +@@ -68,7 +68,7 @@ void + GOMP_parallel_start (void (*fn) (void *), void *data, unsigned num_threads) + { + num_threads = gomp_resolve_num_threads (num_threads); +- gomp_team_start (fn, data, num_threads, NULL); ++ gomp_team_start (fn, data, num_threads, gomp_new_team (num_threads)); + } + + void +--- libgomp/sections.c.jj 2007-12-07 14:41:01.000000000 +0100 ++++ libgomp/sections.c 2008-03-26 15:33:06.000000000 +0100 +@@ -59,14 +59,24 @@ GOMP_sections_start (unsigned count) + long s, e, ret; + + if (gomp_work_share_start (false)) +- gomp_sections_init (thr->ts.work_share, count); ++ { ++ gomp_sections_init (thr->ts.work_share, count); ++ gomp_work_share_init_done (); ++ } + ++#ifdef HAVE_SYNC_BUILTINS ++ if (gomp_iter_dynamic_next (&s, &e)) ++ ret = s; ++ else ++ ret = 0; ++#else ++ gomp_mutex_lock (&thr->ts.work_share->lock); + if (gomp_iter_dynamic_next_locked (&s, &e)) + ret = s; + else + ret = 0; +- + gomp_mutex_unlock (&thr->ts.work_share->lock); ++#endif + + return ret; + } +@@ -83,15 +93,23 @@ GOMP_sections_start (unsigned count) + unsigned + GOMP_sections_next (void) + { +- struct gomp_thread *thr = gomp_thread (); + long s, e, ret; + ++#ifdef HAVE_SYNC_BUILTINS ++ if (gomp_iter_dynamic_next (&s, &e)) ++ ret = s; ++ else ++ ret = 0; ++#else ++ struct gomp_thread *thr = gomp_thread (); ++ + gomp_mutex_lock (&thr->ts.work_share->lock); + if (gomp_iter_dynamic_next_locked (&s, &e)) + ret = s; + else + ret = 0; + gomp_mutex_unlock (&thr->ts.work_share->lock); ++#endif + + return ret; + } +@@ -103,15 +121,15 @@ void + GOMP_parallel_sections_start (void (*fn) (void *), void *data, + unsigned num_threads, unsigned count) + { +- struct gomp_work_share *ws; ++ struct gomp_team *team; + + num_threads = gomp_resolve_num_threads (num_threads); + if (gomp_dyn_var && num_threads > count) + num_threads = count; + +- ws = gomp_new_work_share (false, num_threads); +- gomp_sections_init (ws, count); +- gomp_team_start (fn, data, num_threads, ws); ++ team = gomp_new_team (num_threads); ++ gomp_sections_init (&team->work_shares[0], count); ++ gomp_team_start (fn, data, num_threads, team); + } + + /* The GOMP_section_end* routines are called after the thread is told +--- libgomp/env.c.jj 2007-12-07 14:41:01.000000000 +0100 ++++ libgomp/env.c 2008-03-26 16:40:26.000000000 +0100 +@@ -44,6 +44,11 @@ enum gomp_schedule_type gomp_run_sched_v + unsigned long gomp_run_sched_chunk = 1; + unsigned short *gomp_cpu_affinity; + size_t gomp_cpu_affinity_len; ++#ifndef HAVE_SYNC_BUILTINS ++gomp_mutex_t gomp_remaining_threads_lock; ++#endif ++unsigned long gomp_available_cpus = 1, gomp_managed_threads = 1; ++unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var; + + /* Parse the OMP_SCHEDULE environment variable. */ + +@@ -147,6 +152,79 @@ parse_unsigned_long (const char *name, u + return false; + } + ++/* Parse the GOMP_SPINCOUNT environment varible. Return true if one was ++ present and it was successfully parsed. */ ++ ++static bool ++parse_spincount (const char *name, unsigned long long *pvalue) ++{ ++ char *env, *end; ++ unsigned long long value, mult = 1; ++ ++ env = getenv (name); ++ if (env == NULL) ++ return false; ++ ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ if (*env == '\0') ++ goto invalid; ++ ++ if (strncasecmp (env, "infinite", 8) == 0 ++ || strncasecmp (env, "infinity", 8) == 0) ++ { ++ value = ~0ULL; ++ end = env + 8; ++ goto check_tail; ++ } ++ ++ errno = 0; ++ value = strtoull (env, &end, 10); ++ if (errno) ++ goto invalid; ++ ++ while (isspace ((unsigned char) *end)) ++ ++end; ++ if (*end != '\0') ++ { ++ switch (tolower (*end)) ++ { ++ case 'k': ++ mult = 1000LL; ++ break; ++ case 'm': ++ mult = 1000LL * 1000LL; ++ break; ++ case 'g': ++ mult = 1000LL * 1000LL * 1000LL; ++ break; ++ case 't': ++ mult = 1000LL * 1000LL * 1000LL * 1000LL; ++ break; ++ default: ++ goto invalid; ++ } ++ ++end; ++ check_tail: ++ while (isspace ((unsigned char) *end)) ++ ++end; ++ if (*end != '\0') ++ goto invalid; ++ } ++ ++ if (value > ~0ULL / mult) ++ value = ~0ULL; ++ else ++ value *= mult; ++ ++ *pvalue = value; ++ return true; ++ ++ invalid: ++ gomp_error ("Invalid value for environment variable %s", name); ++ return false; ++} ++ + /* Parse a boolean value for environment variable NAME and store the + result in VALUE. */ + +@@ -281,10 +359,25 @@ initialize_env (void) + parse_schedule (); + parse_boolean ("OMP_DYNAMIC", &gomp_dyn_var); + parse_boolean ("OMP_NESTED", &gomp_nest_var); ++ gomp_init_num_threads (); ++ gomp_available_cpus = gomp_nthreads_var; + if (!parse_unsigned_long ("OMP_NUM_THREADS", &gomp_nthreads_var)) +- gomp_init_num_threads (); ++ gomp_nthreads_var = gomp_available_cpus; + if (parse_affinity ()) + gomp_init_affinity (); ++ if (!parse_spincount ("GOMP_SPINCOUNT", &gomp_spin_count_var)) ++ { ++ /* Using a rough estimation of 100000 spins per msec, ++ use 200 msec blocking. ++ Depending on the CPU speed, this can be e.g. 5 times longer ++ or 5 times shorter. */ ++ gomp_spin_count_var = 20000000LL; ++ } ++ /* gomp_throttled_spin_count_var is used when there are more libgomp ++ managed threads than available CPUs. Use very short spinning. */ ++ gomp_throttled_spin_count_var = 100LL; ++ if (gomp_throttled_spin_count_var > gomp_spin_count_var) ++ gomp_throttled_spin_count_var = gomp_spin_count_var; + + /* Not strictly environment related, but ordering constructors is tricky. */ + pthread_attr_init (&gomp_thread_attr); +--- libgomp/libgomp.h.jj 2007-12-07 14:41:01.000000000 +0100 ++++ libgomp/libgomp.h 2008-03-27 12:21:51.000000000 +0100 +@@ -50,6 +50,7 @@ + #include "sem.h" + #include "mutex.h" + #include "bar.h" ++#include "ptrlock.h" + + + /* This structure contains the data to control one work-sharing construct, +@@ -70,6 +71,8 @@ struct gomp_work_share + If this is a SECTIONS construct, this value will always be DYNAMIC. */ + enum gomp_schedule_type sched; + ++ int mode; ++ + /* This is the chunk_size argument to the SCHEDULE clause. */ + long chunk_size; + +@@ -81,17 +84,38 @@ struct gomp_work_share + is always 1. */ + long incr; + +- /* This lock protects the update of the following members. */ +- gomp_mutex_t lock; ++ /* This is a circular queue that details which threads will be allowed ++ into the ordered region and in which order. When a thread allocates ++ iterations on which it is going to work, it also registers itself at ++ the end of the array. When a thread reaches the ordered region, it ++ checks to see if it is the one at the head of the queue. If not, it ++ blocks on its RELEASE semaphore. */ ++ unsigned *ordered_team_ids; + +- union { +- /* This is the next iteration value to be allocated. In the case of +- GFS_STATIC loops, this the iteration start point and never changes. */ +- long next; ++ /* This is the number of threads that have registered themselves in ++ the circular queue ordered_team_ids. */ ++ unsigned ordered_num_used; + +- /* This is the returned data structure for SINGLE COPYPRIVATE. */ +- void *copyprivate; +- }; ++ /* This is the team_id of the currently acknowledged owner of the ordered ++ section, or -1u if the ordered section has not been acknowledged by ++ any thread. This is distinguished from the thread that is *allowed* ++ to take the section next. */ ++ unsigned ordered_owner; ++ ++ /* This is the index into the circular queue ordered_team_ids of the ++ current thread that's allowed into the ordered reason. */ ++ unsigned ordered_cur; ++ ++ /* This is a chain of allocated gomp_work_share blocks, valid only ++ in the first gomp_work_share struct in the block. */ ++ struct gomp_work_share *next_alloc; ++ ++ /* The above fields are written once during workshare initialization, ++ or related to ordered worksharing. Make sure the following fields ++ are in a different cache line. */ ++ ++ /* This lock protects the update of the following members. */ ++ gomp_mutex_t lock __attribute__((aligned (64))); + + /* This is the count of the number of threads that have exited the work + share construct. If the construct was marked nowait, they have moved on +@@ -99,27 +123,28 @@ struct gomp_work_share + of the team to exit the work share construct must deallocate it. */ + unsigned threads_completed; + +- /* This is the index into the circular queue ordered_team_ids of the +- current thread that's allowed into the ordered reason. */ +- unsigned ordered_cur; ++ union { ++ /* This is the next iteration value to be allocated. In the case of ++ GFS_STATIC loops, this the iteration start point and never changes. */ ++ long next; + +- /* This is the number of threads that have registered themselves in +- the circular queue ordered_team_ids. */ +- unsigned ordered_num_used; ++ /* This is the returned data structure for SINGLE COPYPRIVATE. */ ++ void *copyprivate; ++ }; + +- /* This is the team_id of the currently acknoledged owner of the ordered +- section, or -1u if the ordered section has not been acknowledged by +- any thread. This is distinguished from the thread that is *allowed* +- to take the section next. */ +- unsigned ordered_owner; ++ union { ++ /* Link to gomp_work_share struct for next work sharing construct ++ encountered after this one. */ ++ gomp_ptrlock_t next_ws; ++ ++ /* gomp_work_share structs are chained in the free work share cache ++ through this. */ ++ struct gomp_work_share *next_free; ++ }; + +- /* This is a circular queue that details which threads will be allowed +- into the ordered region and in which order. When a thread allocates +- iterations on which it is going to work, it also registers itself at +- the end of the array. When a thread reaches the ordered region, it +- checks to see if it is the one at the head of the queue. If not, it +- blocks on its RELEASE semaphore. */ +- unsigned ordered_team_ids[]; ++ /* If only few threads are in the team, ordered_team_ids can point ++ to this array which fills the padding at the end of this struct. */ ++ unsigned inline_ordered_team_ids[0]; + }; + + /* This structure contains all of the thread-local data associated with +@@ -133,21 +158,24 @@ struct gomp_team_state + + /* This is the work share construct which this thread is currently + processing. Recall that with NOWAIT, not all threads may be +- processing the same construct. This value is NULL when there +- is no construct being processed. */ ++ processing the same construct. */ + struct gomp_work_share *work_share; + ++ /* This is the previous work share construct or NULL if there wasn't any. ++ When all threads are done with the current work sharing construct, ++ the previous one can be freed. The current one can't, as its ++ next_ws field is used. */ ++ struct gomp_work_share *last_work_share; ++ + /* This is the ID of this thread within the team. This value is + guaranteed to be between 0 and N-1, where N is the number of + threads in the team. */ + unsigned team_id; + +- /* The work share "generation" is a number that increases by one for +- each work share construct encountered in the dynamic flow of the +- program. It is used to find the control data for the work share +- when encountering it for the first time. This particular number +- reflects the generation of the work_share member of this struct. */ +- unsigned work_share_generation; ++#ifdef HAVE_SYNC_BUILTINS ++ /* Number of single stmts encountered. */ ++ unsigned long single_count; ++#endif + + /* For GFS_RUNTIME loops that resolved to GFS_STATIC, this is the + trip number through the loop. So first time a particular loop +@@ -163,41 +191,53 @@ struct gomp_team_state + + struct gomp_team + { +- /* This lock protects access to the following work shares data structures. */ +- gomp_mutex_t work_share_lock; +- +- /* This is a dynamically sized array containing pointers to the control +- structs for all "live" work share constructs. Here "live" means that +- the construct has been encountered by at least one thread, and not +- completed by all threads. */ +- struct gomp_work_share **work_shares; +- +- /* The work_shares array is indexed by "generation & generation_mask". +- The mask will be 2**N - 1, where 2**N is the size of the array. */ +- unsigned generation_mask; +- +- /* These two values define the bounds of the elements of the work_shares +- array that are currently in use. */ +- unsigned oldest_live_gen; +- unsigned num_live_gen; +- + /* This is the number of threads in the current team. */ + unsigned nthreads; + ++ /* This is number of gomp_work_share structs that have been allocated ++ as a block last time. */ ++ unsigned work_share_chunk; ++ + /* This is the saved team state that applied to a master thread before + the current thread was created. */ + struct gomp_team_state prev_ts; + +- /* This barrier is used for most synchronization of the team. */ +- gomp_barrier_t barrier; +- + /* This semaphore should be used by the master thread instead of its + "native" semaphore in the thread structure. Required for nested + parallels, as the master is a member of two teams. */ + gomp_sem_t master_release; + +- /* This array contains pointers to the release semaphore of the threads +- in the team. */ ++ /* List of gomp_work_share structs chained through next_free fields. ++ This is populated and taken off only by the first thread in the ++ team encountering a new work sharing construct, in a critical ++ section. */ ++ struct gomp_work_share *work_share_list_alloc; ++ ++ /* List of gomp_work_share structs freed by free_work_share. New ++ entries are atomically added to the start of the list, and ++ alloc_work_share can safely only move all but the first entry ++ to work_share_list alloc, as free_work_share can happen concurrently ++ with alloc_work_share. */ ++ struct gomp_work_share *work_share_list_free; ++ ++#ifdef HAVE_SYNC_BUILTINS ++ /* Number of simple single regions encountered by threads in this ++ team. */ ++ unsigned long single_count; ++#else ++ /* Mutex protecting addition of workshares to work_share_list_free. */ ++ gomp_mutex_t work_share_list_free_lock; ++#endif ++ ++ /* This barrier is used for most synchronization of the team. */ ++ gomp_barrier_t barrier; ++ ++ /* Initial work shares, to avoid allocating any gomp_work_share ++ structs in the common case. */ ++ struct gomp_work_share work_shares[8]; ++ ++ /* This is an array with pointers to the release semaphore ++ of the threads in the team. */ + gomp_sem_t *ordered_release[]; + }; + +@@ -242,6 +282,11 @@ extern bool gomp_dyn_var; + extern bool gomp_nest_var; + extern enum gomp_schedule_type gomp_run_sched_var; + extern unsigned long gomp_run_sched_chunk; ++#ifndef HAVE_SYNC_BUILTINS ++extern gomp_mutex_t gomp_remaining_threads_lock; ++#endif ++extern unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var; ++extern unsigned long gomp_available_cpus, gomp_managed_threads; + + /* The attributes to be used during thread creation. */ + extern pthread_attr_t gomp_thread_attr; +@@ -306,17 +351,27 @@ extern unsigned gomp_dynamic_max_threads + + /* team.c */ + ++extern struct gomp_team *gomp_new_team (unsigned); + extern void gomp_team_start (void (*) (void *), void *, unsigned, +- struct gomp_work_share *); ++ struct gomp_team *); + extern void gomp_team_end (void); + + /* work.c */ + +-extern struct gomp_work_share * gomp_new_work_share (bool, unsigned); ++extern void gomp_init_work_share (struct gomp_work_share *, bool, unsigned); ++extern void gomp_fini_work_share (struct gomp_work_share *); + extern bool gomp_work_share_start (bool); + extern void gomp_work_share_end (void); + extern void gomp_work_share_end_nowait (void); + ++static inline void ++gomp_work_share_init_done (void) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ if (__builtin_expect (thr->ts.last_work_share != NULL, 1)) ++ gomp_ptrlock_set (&thr->ts.last_work_share->next_ws, thr->ts.work_share); ++} ++ + #ifdef HAVE_ATTRIBUTE_VISIBILITY + # pragma GCC visibility pop + #endif +--- libgomp/iter.c.jj 2008-03-26 14:48:34.000000000 +0100 ++++ libgomp/iter.c 2008-03-26 15:11:23.000000000 +0100 +@@ -1,4 +1,4 @@ +-/* Copyright (C) 2005 Free Software Foundation, Inc. ++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc. + Contributed by Richard Henderson <rth@redhat.com>. + + This file is part of the GNU OpenMP Library (libgomp). +@@ -154,7 +154,7 @@ gomp_iter_dynamic_next_locked (long *pst + if (start == ws->end) + return false; + +- chunk = ws->chunk_size * ws->incr; ++ chunk = ws->chunk_size; + left = ws->end - start; + if (ws->incr < 0) + { +@@ -186,11 +186,38 @@ gomp_iter_dynamic_next (long *pstart, lo + struct gomp_work_share *ws = thr->ts.work_share; + long start, end, nend, chunk, incr; + +- start = ws->next; + end = ws->end; + incr = ws->incr; +- chunk = ws->chunk_size * incr; ++ chunk = ws->chunk_size; ++ ++ if (__builtin_expect (ws->mode, 1)) ++ { ++ long tmp = __sync_fetch_and_add (&ws->next, chunk); ++ if (incr > 0) ++ { ++ if (tmp >= end) ++ return false; ++ nend = tmp + chunk; ++ if (nend > end) ++ nend = end; ++ *pstart = tmp; ++ *pend = nend; ++ return true; ++ } ++ else ++ { ++ if (tmp <= end) ++ return false; ++ nend = tmp + chunk; ++ if (nend < end) ++ nend = end; ++ *pstart = tmp; ++ *pend = nend; ++ return true; ++ } ++ } + ++ start = ws->next; + while (1) + { + long left = end - start; +--- libgomp/work.c.jj 2007-12-07 14:41:01.000000000 +0100 ++++ libgomp/work.c 2008-03-27 12:21:51.000000000 +0100 +@@ -1,4 +1,4 @@ +-/* Copyright (C) 2005 Free Software Foundation, Inc. ++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc. + Contributed by Richard Henderson <rth@redhat.com>. + + This file is part of the GNU OpenMP Library (libgomp). +@@ -29,39 +29,138 @@ + of threads. */ + + #include "libgomp.h" ++#include <stddef.h> + #include <stdlib.h> + #include <string.h> + + +-/* Create a new work share structure. */ ++/* Allocate a new work share structure, preferably from current team's ++ free gomp_work_share cache. */ + +-struct gomp_work_share * +-gomp_new_work_share (bool ordered, unsigned nthreads) ++static struct gomp_work_share * ++alloc_work_share (struct gomp_team *team) + { + struct gomp_work_share *ws; +- size_t size; ++ unsigned int i; + +- size = sizeof (*ws); +- if (ordered) +- size += nthreads * sizeof (ws->ordered_team_ids[0]); ++ /* This is called in a critical section. */ ++ if (team->work_share_list_alloc != NULL) ++ { ++ ws = team->work_share_list_alloc; ++ team->work_share_list_alloc = ws->next_free; ++ return ws; ++ } + +- ws = gomp_malloc_cleared (size); +- gomp_mutex_init (&ws->lock); +- ws->ordered_owner = -1; ++#ifdef HAVE_SYNC_BUILTINS ++ ws = team->work_share_list_free; ++ /* We need atomic read from work_share_list_free, ++ as free_work_share can be called concurrently. */ ++ __asm ("" : "+r" (ws)); ++ ++ if (ws && ws->next_free) ++ { ++ struct gomp_work_share *next = ws->next_free; ++ ws->next_free = NULL; ++ team->work_share_list_alloc = next->next_free; ++ return next; ++ } ++#else ++ gomp_mutex_lock (&team->work_share_list_free_lock); ++ ws = team->work_share_list_free; ++ if (ws) ++ { ++ team->work_share_list_alloc = ws->next_free; ++ team->work_share_list_free = NULL; ++ gomp_mutex_unlock (&team->work_share_list_free_lock); ++ return ws; ++ } ++ gomp_mutex_unlock (&team->work_share_list_free_lock); ++#endif + ++ team->work_share_chunk *= 2; ++ ws = gomp_malloc (team->work_share_chunk * sizeof (struct gomp_work_share)); ++ ws->next_alloc = team->work_shares[0].next_alloc; ++ team->work_shares[0].next_alloc = ws; ++ team->work_share_list_alloc = &ws[1]; ++ for (i = 1; i < team->work_share_chunk - 1; i++) ++ ws[i].next_free = &ws[i + 1]; ++ ws[i].next_free = NULL; + return ws; + } + ++/* Initialize an already allocated struct gomp_work_share. ++ This shouldn't touch the next_alloc field. */ ++ ++void ++gomp_init_work_share (struct gomp_work_share *ws, bool ordered, ++ unsigned nthreads) ++{ ++ gomp_mutex_init (&ws->lock); ++ if (__builtin_expect (ordered, 0)) ++ { ++#define INLINE_ORDERED_TEAM_IDS_CNT \ ++ ((sizeof (struct gomp_work_share) \ ++ - offsetof (struct gomp_work_share, inline_ordered_team_ids)) \ ++ / sizeof (((struct gomp_work_share *) 0)->inline_ordered_team_ids[0])) ++ ++ if (nthreads > INLINE_ORDERED_TEAM_IDS_CNT) ++ ws->ordered_team_ids ++ = gomp_malloc (nthreads * sizeof (*ws->ordered_team_ids)); ++ else ++ ws->ordered_team_ids = ws->inline_ordered_team_ids; ++ memset (ws->ordered_team_ids, '\0', ++ nthreads * sizeof (*ws->ordered_team_ids)); ++ ws->ordered_num_used = 0; ++ ws->ordered_owner = -1; ++ ws->ordered_cur = 0; ++ } ++ else ++ ws->ordered_team_ids = NULL; ++ gomp_ptrlock_init (&ws->next_ws, NULL); ++ ws->threads_completed = 0; ++} + +-/* Free a work share structure. */ ++/* Do any needed destruction of gomp_work_share fields before it ++ is put back into free gomp_work_share cache or freed. */ + +-static void +-free_work_share (struct gomp_work_share *ws) ++void ++gomp_fini_work_share (struct gomp_work_share *ws) + { + gomp_mutex_destroy (&ws->lock); +- free (ws); ++ if (ws->ordered_team_ids != ws->inline_ordered_team_ids) ++ free (ws->ordered_team_ids); ++ gomp_ptrlock_destroy (&ws->next_ws); + } + ++/* Free a work share struct, if not orphaned, put it into current ++ team's free gomp_work_share cache. */ ++ ++static inline void ++free_work_share (struct gomp_team *team, struct gomp_work_share *ws) ++{ ++ gomp_fini_work_share (ws); ++ if (__builtin_expect (team == NULL, 0)) ++ free (ws); ++ else ++ { ++ struct gomp_work_share *next_ws; ++#ifdef HAVE_SYNC_BUILTINS ++ do ++ { ++ next_ws = team->work_share_list_free; ++ ws->next_free = next_ws; ++ } ++ while (!__sync_bool_compare_and_swap (&team->work_share_list_free, ++ next_ws, ws)); ++#else ++ gomp_mutex_lock (&team->work_share_list_free_lock); ++ next_ws = team->work_share_list_free; ++ ws->next_free = next_ws; ++ team->work_share_list_free = ws; ++ gomp_mutex_unlock (&team->work_share_list_free_lock); ++#endif ++ } ++} + + /* The current thread is ready to begin the next work sharing construct. + In all cases, thr->ts.work_share is updated to point to the new +@@ -74,71 +173,34 @@ gomp_work_share_start (bool ordered) + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; + struct gomp_work_share *ws; +- unsigned ws_index, ws_gen; + + /* Work sharing constructs can be orphaned. */ + if (team == NULL) + { +- ws = gomp_new_work_share (ordered, 1); ++ ws = gomp_malloc (sizeof (*ws)); ++ gomp_init_work_share (ws, ordered, 1); + thr->ts.work_share = ws; +- thr->ts.static_trip = 0; +- gomp_mutex_lock (&ws->lock); +- return true; ++ return ws; + } + +- gomp_mutex_lock (&team->work_share_lock); +- +- /* This thread is beginning its next generation. */ +- ws_gen = ++thr->ts.work_share_generation; +- +- /* If this next generation is not newer than any other generation in +- the team, then simply reference the existing construct. */ +- if (ws_gen - team->oldest_live_gen < team->num_live_gen) ++ ws = thr->ts.work_share; ++ thr->ts.last_work_share = ws; ++ ws = gomp_ptrlock_get (&ws->next_ws); ++ if (ws == NULL) + { +- ws_index = ws_gen & team->generation_mask; +- ws = team->work_shares[ws_index]; ++ /* This thread encountered a new ws first. */ ++ struct gomp_work_share *ws = alloc_work_share (team); ++ gomp_init_work_share (ws, ordered, team->nthreads); + thr->ts.work_share = ws; +- thr->ts.static_trip = 0; +- +- gomp_mutex_lock (&ws->lock); +- gomp_mutex_unlock (&team->work_share_lock); +- +- return false; ++ return true; + } +- +- /* Resize the work shares queue if we've run out of space. */ +- if (team->num_live_gen++ == team->generation_mask) ++ else + { +- team->work_shares = gomp_realloc (team->work_shares, +- 2 * team->num_live_gen +- * sizeof (*team->work_shares)); +- +- /* Unless oldest_live_gen is zero, the sequence of live elements +- wraps around the end of the array. If we do nothing, we break +- lookup of the existing elements. Fix that by unwrapping the +- data from the front to the end. */ +- if (team->oldest_live_gen > 0) +- memcpy (team->work_shares + team->num_live_gen, +- team->work_shares, +- (team->oldest_live_gen & team->generation_mask) +- * sizeof (*team->work_shares)); +- +- team->generation_mask = team->generation_mask * 2 + 1; +- } +- +- ws_index = ws_gen & team->generation_mask; +- ws = gomp_new_work_share (ordered, team->nthreads); +- thr->ts.work_share = ws; +- thr->ts.static_trip = 0; +- team->work_shares[ws_index] = ws; +- +- gomp_mutex_lock (&ws->lock); +- gomp_mutex_unlock (&team->work_share_lock); +- +- return true; ++ thr->ts.work_share = ws; ++ return false; ++ } + } + +- + /* The current thread is done with its current work sharing construct. + This version does imply a barrier at the end of the work-share. */ + +@@ -147,36 +209,28 @@ gomp_work_share_end (void) + { + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; +- struct gomp_work_share *ws = thr->ts.work_share; +- bool last; +- +- thr->ts.work_share = NULL; ++ gomp_barrier_state_t bstate; + + /* Work sharing constructs can be orphaned. */ + if (team == NULL) + { +- free_work_share (ws); ++ free_work_share (NULL, thr->ts.work_share); ++ thr->ts.work_share = NULL; + return; + } + +- last = gomp_barrier_wait_start (&team->barrier); ++ bstate = gomp_barrier_wait_start (&team->barrier); + +- if (last) ++ if (gomp_barrier_last_thread (bstate)) + { +- unsigned ws_index; +- +- ws_index = thr->ts.work_share_generation & team->generation_mask; +- team->work_shares[ws_index] = NULL; +- team->oldest_live_gen++; +- team->num_live_gen = 0; +- +- free_work_share (ws); ++ if (__builtin_expect (thr->ts.last_work_share != NULL, 1)) ++ free_work_share (team, thr->ts.last_work_share); + } + +- gomp_barrier_wait_end (&team->barrier, last); ++ gomp_barrier_wait_end (&team->barrier, bstate); ++ thr->ts.last_work_share = NULL; + } + +- + /* The current thread is done with its current work sharing construct. + This version does NOT imply a barrier at the end of the work-share. */ + +@@ -188,15 +242,17 @@ gomp_work_share_end_nowait (void) + struct gomp_work_share *ws = thr->ts.work_share; + unsigned completed; + +- thr->ts.work_share = NULL; +- + /* Work sharing constructs can be orphaned. */ + if (team == NULL) + { +- free_work_share (ws); ++ free_work_share (NULL, ws); ++ thr->ts.work_share = NULL; + return; + } + ++ if (__builtin_expect (thr->ts.last_work_share == NULL, 0)) ++ return; ++ + #ifdef HAVE_SYNC_BUILTINS + completed = __sync_add_and_fetch (&ws->threads_completed, 1); + #else +@@ -206,18 +262,6 @@ gomp_work_share_end_nowait (void) + #endif + + if (completed == team->nthreads) +- { +- unsigned ws_index; +- +- gomp_mutex_lock (&team->work_share_lock); +- +- ws_index = thr->ts.work_share_generation & team->generation_mask; +- team->work_shares[ws_index] = NULL; +- team->oldest_live_gen++; +- team->num_live_gen--; +- +- gomp_mutex_unlock (&team->work_share_lock); +- +- free_work_share (ws); +- } ++ free_work_share (team, thr->ts.last_work_share); ++ thr->ts.last_work_share = NULL; + } +--- libgomp/single.c.jj 2007-12-07 14:41:01.000000000 +0100 ++++ libgomp/single.c 2008-03-26 15:11:32.000000000 +0100 +@@ -1,4 +1,4 @@ +-/* Copyright (C) 2005 Free Software Foundation, Inc. ++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc. + Contributed by Richard Henderson <rth@redhat.com>. + + This file is part of the GNU OpenMP Library (libgomp). +@@ -37,10 +37,24 @@ + bool + GOMP_single_start (void) + { ++#ifdef HAVE_SYNC_BUILTINS ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_team *team = thr->ts.team; ++ unsigned long single_count; ++ ++ if (__builtin_expect (team == NULL, 0)) ++ return true; ++ ++ single_count = thr->ts.single_count++; ++ return __sync_bool_compare_and_swap (&team->single_count, single_count, ++ single_count + 1L); ++#else + bool ret = gomp_work_share_start (false); +- gomp_mutex_unlock (&gomp_thread ()->ts.work_share->lock); ++ if (ret) ++ gomp_work_share_init_done (); + gomp_work_share_end_nowait (); + return ret; ++#endif + } + + /* This routine is called when first encountering a SINGLE construct that +@@ -57,10 +71,12 @@ GOMP_single_copy_start (void) + void *ret; + + first = gomp_work_share_start (false); +- gomp_mutex_unlock (&thr->ts.work_share->lock); + + if (first) +- ret = NULL; ++ { ++ gomp_work_share_init_done (); ++ ret = NULL; ++ } + else + { + gomp_barrier_wait (&thr->ts.team->barrier); +--- libgomp/loop.c.jj 2007-12-07 14:41:01.000000000 +0100 ++++ libgomp/loop.c 2008-03-26 18:47:04.000000000 +0100 +@@ -27,8 +27,9 @@ + + /* This file handles the LOOP (FOR/DO) construct. */ + +-#include "libgomp.h" ++#include <limits.h> + #include <stdlib.h> ++#include "libgomp.h" + + + /* Initialize the given work share construct from the given arguments. */ +@@ -44,6 +45,39 @@ gomp_loop_init (struct gomp_work_share * + ? start : end; + ws->incr = incr; + ws->next = start; ++ if (sched == GFS_DYNAMIC) ++ { ++ ws->chunk_size *= incr; ++ ++#ifdef HAVE_SYNC_BUILTINS ++ { ++ /* For dynamic scheduling prepare things to make each iteration ++ faster. */ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_team *team = thr->ts.team; ++ long nthreads = team ? team->nthreads : 1; ++ ++ if (__builtin_expect (incr > 0, 1)) ++ { ++ /* Cheap overflow protection. */ ++ if (__builtin_expect ((nthreads | ws->chunk_size) ++ >= 1UL << (sizeof (long) ++ * __CHAR_BIT__ / 2 - 1), 0)) ++ ws->mode = 0; ++ else ++ ws->mode = ws->end < (LONG_MAX ++ - (nthreads + 1) * ws->chunk_size); ++ } ++ /* Cheap overflow protection. */ ++ else if (__builtin_expect ((nthreads | -ws->chunk_size) ++ >= 1UL << (sizeof (long) ++ * __CHAR_BIT__ / 2 - 1), 0)) ++ ws->mode = 0; ++ else ++ ws->mode = ws->end > (nthreads + 1) * -ws->chunk_size - LONG_MAX; ++ } ++#endif ++ } + } + + /* The *_start routines are called when first encountering a loop construct +@@ -68,10 +102,13 @@ gomp_loop_static_start (long start, long + { + struct gomp_thread *thr = gomp_thread (); + ++ thr->ts.static_trip = 0; + if (gomp_work_share_start (false)) +- gomp_loop_init (thr->ts.work_share, start, end, incr, +- GFS_STATIC, chunk_size); +- gomp_mutex_unlock (&thr->ts.work_share->lock); ++ { ++ gomp_loop_init (thr->ts.work_share, start, end, incr, ++ GFS_STATIC, chunk_size); ++ gomp_work_share_init_done (); ++ } + + return !gomp_iter_static_next (istart, iend); + } +@@ -84,13 +121,16 @@ gomp_loop_dynamic_start (long start, lon + bool ret; + + if (gomp_work_share_start (false)) +- gomp_loop_init (thr->ts.work_share, start, end, incr, +- GFS_DYNAMIC, chunk_size); ++ { ++ gomp_loop_init (thr->ts.work_share, start, end, incr, ++ GFS_DYNAMIC, chunk_size); ++ gomp_work_share_init_done (); ++ } + + #ifdef HAVE_SYNC_BUILTINS +- gomp_mutex_unlock (&thr->ts.work_share->lock); + ret = gomp_iter_dynamic_next (istart, iend); + #else ++ gomp_mutex_lock (&thr->ts.work_share->lock); + ret = gomp_iter_dynamic_next_locked (istart, iend); + gomp_mutex_unlock (&thr->ts.work_share->lock); + #endif +@@ -106,13 +146,16 @@ gomp_loop_guided_start (long start, long + bool ret; + + if (gomp_work_share_start (false)) +- gomp_loop_init (thr->ts.work_share, start, end, incr, +- GFS_GUIDED, chunk_size); ++ { ++ gomp_loop_init (thr->ts.work_share, start, end, incr, ++ GFS_GUIDED, chunk_size); ++ gomp_work_share_init_done (); ++ } + + #ifdef HAVE_SYNC_BUILTINS +- gomp_mutex_unlock (&thr->ts.work_share->lock); + ret = gomp_iter_guided_next (istart, iend); + #else ++ gomp_mutex_lock (&thr->ts.work_share->lock); + ret = gomp_iter_guided_next_locked (istart, iend); + gomp_mutex_unlock (&thr->ts.work_share->lock); + #endif +@@ -149,13 +192,14 @@ gomp_loop_ordered_static_start (long sta + { + struct gomp_thread *thr = gomp_thread (); + ++ thr->ts.static_trip = 0; + if (gomp_work_share_start (true)) + { + gomp_loop_init (thr->ts.work_share, start, end, incr, + GFS_STATIC, chunk_size); + gomp_ordered_static_init (); ++ gomp_work_share_init_done (); + } +- gomp_mutex_unlock (&thr->ts.work_share->lock); + + return !gomp_iter_static_next (istart, iend); + } +@@ -168,8 +212,14 @@ gomp_loop_ordered_dynamic_start (long st + bool ret; + + if (gomp_work_share_start (true)) +- gomp_loop_init (thr->ts.work_share, start, end, incr, +- GFS_DYNAMIC, chunk_size); ++ { ++ gomp_loop_init (thr->ts.work_share, start, end, incr, ++ GFS_DYNAMIC, chunk_size); ++ gomp_mutex_lock (&thr->ts.work_share->lock); ++ gomp_work_share_init_done (); ++ } ++ else ++ gomp_mutex_lock (&thr->ts.work_share->lock); + + ret = gomp_iter_dynamic_next_locked (istart, iend); + if (ret) +@@ -187,8 +237,14 @@ gomp_loop_ordered_guided_start (long sta + bool ret; + + if (gomp_work_share_start (true)) +- gomp_loop_init (thr->ts.work_share, start, end, incr, +- GFS_GUIDED, chunk_size); ++ { ++ gomp_loop_init (thr->ts.work_share, start, end, incr, ++ GFS_GUIDED, chunk_size); ++ gomp_mutex_lock (&thr->ts.work_share->lock); ++ gomp_work_share_init_done (); ++ } ++ else ++ gomp_mutex_lock (&thr->ts.work_share->lock); + + ret = gomp_iter_guided_next_locked (istart, iend); + if (ret) +@@ -375,12 +431,12 @@ gomp_parallel_loop_start (void (*fn) (vo + long incr, enum gomp_schedule_type sched, + long chunk_size) + { +- struct gomp_work_share *ws; ++ struct gomp_team *team; + + num_threads = gomp_resolve_num_threads (num_threads); +- ws = gomp_new_work_share (false, num_threads); +- gomp_loop_init (ws, start, end, incr, sched, chunk_size); +- gomp_team_start (fn, data, num_threads, ws); ++ team = gomp_new_team (num_threads); ++ gomp_loop_init (&team->work_shares[0], start, end, incr, sched, chunk_size); ++ gomp_team_start (fn, data, num_threads, team); + } + + void +--- libgomp/Makefile.in.jj 2008-01-10 20:53:47.000000000 +0100 ++++ libgomp/Makefile.in 2008-03-26 18:51:01.000000000 +0100 +@@ -83,7 +83,7 @@ libgomp_la_LIBADD = + am_libgomp_la_OBJECTS = alloc.lo barrier.lo critical.lo env.lo \ + error.lo iter.lo loop.lo ordered.lo parallel.lo sections.lo \ + single.lo team.lo work.lo lock.lo mutex.lo proc.lo sem.lo \ +- bar.lo time.lo fortran.lo affinity.lo ++ bar.lo ptrlock.lo time.lo fortran.lo affinity.lo + libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS) + DEFAULT_INCLUDES = -I. -I$(srcdir) -I. + depcomp = $(SHELL) $(top_srcdir)/../depcomp +@@ -292,7 +292,7 @@ libgomp_version_info = -version-info $(l + libgomp_la_LDFLAGS = $(libgomp_version_info) $(libgomp_version_script) + libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \ + loop.c ordered.c parallel.c sections.c single.c team.c work.c \ +- lock.c mutex.c proc.c sem.c bar.c time.c fortran.c affinity.c ++ lock.c mutex.c proc.c sem.c bar.c ptrlock.c time.c fortran.c affinity.c + + nodist_noinst_HEADERS = libgomp_f.h + nodist_libsubinclude_HEADERS = omp.h +@@ -434,6 +434,7 @@ distclean-compile: + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ordered.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/parallel.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/proc.Plo@am__quote@ ++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ptrlock.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sections.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sem.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/single.Plo@am__quote@ +--- libgomp/testsuite/libgomp.c/loop-4.c.jj 2008-03-26 18:47:04.000000000 +0100 ++++ libgomp/testsuite/libgomp.c/loop-4.c 2008-03-26 18:47:04.000000000 +0100 +@@ -0,0 +1,28 @@ ++/* { dg-do run } */ ++ ++extern void abort (void); ++ ++int ++main (void) ++{ ++ int e = 0; ++#pragma omp parallel num_threads (4) reduction(+:e) ++ { ++ long i; ++ #pragma omp for schedule(dynamic,1) ++ for (i = __LONG_MAX__ - 30001; i <= __LONG_MAX__ - 10001; i += 10000) ++ if (i != __LONG_MAX__ - 30001 ++ && i != __LONG_MAX__ - 20001 ++ && i != __LONG_MAX__ - 10001) ++ e = 1; ++ #pragma omp for schedule(dynamic,1) ++ for (i = -__LONG_MAX__ + 30000; i >= -__LONG_MAX__ + 10000; i -= 10000) ++ if (i != -__LONG_MAX__ + 30000 ++ && i != -__LONG_MAX__ + 20000 ++ && i != -__LONG_MAX__ + 10000) ++ e = 1; ++ } ++ if (e) ++ abort (); ++ return 0; ++} +--- libgomp/Makefile.am.jj 2007-12-07 14:41:01.000000000 +0100 ++++ libgomp/Makefile.am 2008-03-26 15:15:19.000000000 +0100 +@@ -31,7 +31,7 @@ libgomp_la_LDFLAGS = $(libgomp_version_i + + libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \ + loop.c ordered.c parallel.c sections.c single.c team.c work.c \ +- lock.c mutex.c proc.c sem.c bar.c time.c fortran.c affinity.c ++ lock.c mutex.c proc.c sem.c bar.c ptrlock.c time.c fortran.c affinity.c + + nodist_noinst_HEADERS = libgomp_f.h + nodist_libsubinclude_HEADERS = omp.h +--- libgomp/team.c.jj 2007-12-07 14:41:01.000000000 +0100 ++++ libgomp/team.c 2008-03-27 12:22:26.000000000 +0100 +@@ -94,7 +94,7 @@ gomp_thread_start (void *xdata) + { + gomp_barrier_wait (&thr->ts.team->barrier); + local_fn (local_data); +- gomp_barrier_wait (&thr->ts.team->barrier); ++ gomp_barrier_wait_last (&thr->ts.team->barrier); + } + else + { +@@ -114,11 +114,10 @@ gomp_thread_start (void *xdata) + thr->data = NULL; + thr->ts.team = NULL; + thr->ts.work_share = NULL; ++ thr->ts.last_work_share = NULL; + thr->ts.team_id = 0; +- thr->ts.work_share_generation = 0; +- thr->ts.static_trip = 0; + +- gomp_barrier_wait (&team->barrier); ++ gomp_barrier_wait_last (&team->barrier); + gomp_barrier_wait (&gomp_threads_dock); + + local_fn = thr->fn; +@@ -133,21 +132,29 @@ gomp_thread_start (void *xdata) + + /* Create a new team data structure. */ + +-static struct gomp_team * +-new_team (unsigned nthreads, struct gomp_work_share *work_share) ++struct gomp_team * ++gomp_new_team (unsigned nthreads) + { + struct gomp_team *team; + size_t size; ++ int i; + + size = sizeof (*team) + nthreads * sizeof (team->ordered_release[0]); + team = gomp_malloc (size); +- gomp_mutex_init (&team->work_share_lock); + +- team->work_shares = gomp_malloc (4 * sizeof (struct gomp_work_share *)); +- team->generation_mask = 3; +- team->oldest_live_gen = work_share == NULL; +- team->num_live_gen = work_share != NULL; +- team->work_shares[0] = work_share; ++ team->work_share_chunk = 8; ++#ifdef HAVE_SYNC_BUILTINS ++ team->single_count = 0; ++#else ++ gomp_mutex_init (&team->work_share_list_free_lock); ++#endif ++ gomp_init_work_share (&team->work_shares[0], false, nthreads); ++ team->work_shares[0].next_alloc = NULL; ++ team->work_share_list_free = NULL; ++ team->work_share_list_alloc = &team->work_shares[1]; ++ for (i = 1; i < 7; i++) ++ team->work_shares[i].next_free = &team->work_shares[i + 1]; ++ team->work_shares[i].next_free = NULL; + + team->nthreads = nthreads; + gomp_barrier_init (&team->barrier, nthreads); +@@ -164,10 +171,22 @@ new_team (unsigned nthreads, struct gomp + static void + free_team (struct gomp_team *team) + { +- free (team->work_shares); +- gomp_mutex_destroy (&team->work_share_lock); ++ if (__builtin_expect (team->work_shares[0].next_alloc != NULL, 0)) ++ { ++ struct gomp_work_share *ws = team->work_shares[0].next_alloc; ++ do ++ { ++ struct gomp_work_share *next_ws = ws->next_alloc; ++ free (ws); ++ ws = next_ws; ++ } ++ while (ws != NULL); ++ } + gomp_barrier_destroy (&team->barrier); + gomp_sem_destroy (&team->master_release); ++#ifndef HAVE_SYNC_BUILTINS ++ gomp_mutex_destroy (&team->work_share_list_free_lock); ++#endif + free (team); + } + +@@ -176,11 +195,10 @@ free_team (struct gomp_team *team) + + void + gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, +- struct gomp_work_share *work_share) ++ struct gomp_team *team) + { + struct gomp_thread_start_data *start_data; + struct gomp_thread *thr, *nthr; +- struct gomp_team *team; + bool nested; + unsigned i, n, old_threads_used = 0; + pthread_attr_t thread_attr, *attr; +@@ -188,17 +206,18 @@ gomp_team_start (void (*fn) (void *), vo + thr = gomp_thread (); + nested = thr->ts.team != NULL; + +- team = new_team (nthreads, work_share); +- + /* Always save the previous state, even if this isn't a nested team. + In particular, we should save any work share state from an outer + orphaned work share construct. */ + team->prev_ts = thr->ts; + + thr->ts.team = team; +- thr->ts.work_share = work_share; + thr->ts.team_id = 0; +- thr->ts.work_share_generation = 0; ++ thr->ts.work_share = &team->work_shares[0]; ++ thr->ts.last_work_share = NULL; ++#ifdef HAVE_SYNC_BUILTINS ++ thr->ts.single_count = 0; ++#endif + thr->ts.static_trip = 0; + + if (nthreads == 1) +@@ -241,9 +260,12 @@ gomp_team_start (void (*fn) (void *), vo + { + nthr = gomp_threads[i]; + nthr->ts.team = team; +- nthr->ts.work_share = work_share; ++ nthr->ts.work_share = &team->work_shares[0]; ++ nthr->ts.last_work_share = NULL; + nthr->ts.team_id = i; +- nthr->ts.work_share_generation = 0; ++#ifdef HAVE_SYNC_BUILTINS ++ nthr->ts.single_count = 0; ++#endif + nthr->ts.static_trip = 0; + nthr->fn = fn; + nthr->data = data; +@@ -266,8 +288,24 @@ gomp_team_start (void (*fn) (void *), vo + } + } + ++ if (__builtin_expect (nthreads > old_threads_used, 0)) ++ { ++ long diff = (long) nthreads - (long) old_threads_used; ++ ++ if (old_threads_used == 0) ++ --diff; ++ ++#ifdef HAVE_SYNC_BUILTINS ++ __sync_fetch_and_add (&gomp_managed_threads, diff); ++#else ++ gomp_mutex_lock (&gomp_remaining_threads_lock); ++ gomp_managed_threads += diff; ++ gomp_mutex_unlock (&gomp_remaining_threads_lock); ++#endif ++ } ++ + attr = &gomp_thread_attr; +- if (gomp_cpu_affinity != NULL) ++ if (__builtin_expect (gomp_cpu_affinity != NULL, 0)) + { + size_t stacksize; + pthread_attr_init (&thread_attr); +@@ -287,9 +325,12 @@ gomp_team_start (void (*fn) (void *), vo + int err; + + start_data->ts.team = team; +- start_data->ts.work_share = work_share; ++ start_data->ts.work_share = &team->work_shares[0]; ++ start_data->ts.last_work_share = NULL; + start_data->ts.team_id = i; +- start_data->ts.work_share_generation = 0; ++#ifdef HAVE_SYNC_BUILTINS ++ start_data->ts.single_count = 0; ++#endif + start_data->ts.static_trip = 0; + start_data->fn = fn; + start_data->fn_data = data; +@@ -303,7 +344,7 @@ gomp_team_start (void (*fn) (void *), vo + gomp_fatal ("Thread creation failed: %s", strerror (err)); + } + +- if (gomp_cpu_affinity != NULL) ++ if (__builtin_expect (gomp_cpu_affinity != NULL, 0)) + pthread_attr_destroy (&thread_attr); + + do_release: +@@ -313,8 +354,20 @@ gomp_team_start (void (*fn) (void *), vo + that should arrive back at the end of this team. The extra + threads should be exiting. Note that we arrange for this test + to never be true for nested teams. */ +- if (nthreads < old_threads_used) +- gomp_barrier_reinit (&gomp_threads_dock, nthreads); ++ if (__builtin_expect (nthreads < old_threads_used, 0)) ++ { ++ long diff = (long) nthreads - (long) old_threads_used; ++ ++ gomp_barrier_reinit (&gomp_threads_dock, nthreads); ++ ++#ifdef HAVE_SYNC_BUILTINS ++ __sync_fetch_and_add (&gomp_managed_threads, diff); ++#else ++ gomp_mutex_lock (&gomp_remaining_threads_lock); ++ gomp_managed_threads += diff; ++ gomp_mutex_unlock (&gomp_remaining_threads_lock); ++#endif ++ } + } + + +@@ -329,8 +382,21 @@ gomp_team_end (void) + + gomp_barrier_wait (&team->barrier); + ++ gomp_fini_work_share (thr->ts.work_share); ++ + thr->ts = team->prev_ts; + ++ if (__builtin_expect (thr->ts.team != NULL, 0)) ++ { ++#ifdef HAVE_SYNC_BUILTINS ++ __sync_fetch_and_add (&gomp_managed_threads, 1L - team->nthreads); ++#else ++ gomp_mutex_lock (&gomp_remaining_threads_lock); ++ gomp_managed_threads -= team->nthreads - 1L; ++ gomp_mutex_unlock (&gomp_remaining_threads_lock); ++#endif ++ } ++ + free_team (team); + } + +--- libgomp/config/posix/bar.h.jj 2007-12-07 14:41:01.000000000 +0100 ++++ libgomp/config/posix/bar.h 2008-03-26 15:11:32.000000000 +0100 +@@ -1,4 +1,4 @@ +-/* Copyright (C) 2005 Free Software Foundation, Inc. ++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc. + Contributed by Richard Henderson <rth@redhat.com>. + + This file is part of the GNU OpenMP Library (libgomp). +@@ -46,18 +46,32 @@ typedef struct + unsigned total; + unsigned arrived; + } gomp_barrier_t; ++typedef bool gomp_barrier_state_t; + + extern void gomp_barrier_init (gomp_barrier_t *, unsigned); + extern void gomp_barrier_reinit (gomp_barrier_t *, unsigned); + extern void gomp_barrier_destroy (gomp_barrier_t *); + + extern void gomp_barrier_wait (gomp_barrier_t *); +-extern void gomp_barrier_wait_end (gomp_barrier_t *, bool); ++extern void gomp_barrier_wait_end (gomp_barrier_t *, gomp_barrier_state_t); + +-static inline bool gomp_barrier_wait_start (gomp_barrier_t *bar) ++static inline gomp_barrier_state_t ++gomp_barrier_wait_start (gomp_barrier_t *bar) + { + gomp_mutex_lock (&bar->mutex1); + return ++bar->arrived == bar->total; + } + ++static inline bool ++gomp_barrier_last_thread (gomp_barrier_state_t state) ++{ ++ return state; ++} ++ ++static inline void ++gomp_barrier_wait_last (gomp_barrier_t *bar) ++{ ++ gomp_barrier_wait (bar); ++} ++ + #endif /* GOMP_BARRIER_H */ +--- libgomp/config/posix/ptrlock.h.jj 2008-03-26 15:11:32.000000000 +0100 ++++ libgomp/config/posix/ptrlock.h 2008-03-26 15:11:32.000000000 +0100 +@@ -0,0 +1,69 @@ ++/* Copyright (C) 2008 Free Software Foundation, Inc. ++ Contributed by Jakub Jelinek <jakub@redhat.com>. ++ ++ This file is part of the GNU OpenMP Library (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU Lesser General Public License as published by ++ the Free Software Foundation; either version 2.1 of the License, or ++ (at your option) any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for ++ more details. ++ ++ You should have received a copy of the GNU Lesser General Public License ++ along with libgomp; see the file COPYING.LIB. If not, write to the ++ Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, ++ MA 02110-1301, USA. */ ++ ++/* As a special exception, if you link this library with other files, some ++ of which are compiled with GCC, to produce an executable, this library ++ does not by itself cause the resulting executable to be covered by the ++ GNU General Public License. This exception does not however invalidate ++ any other reasons why the executable file might be covered by the GNU ++ General Public License. */ ++ ++/* This is a Linux specific implementation of a mutex synchronization ++ mechanism for libgomp. This type is private to the library. This ++ implementation uses atomic instructions and the futex syscall. */ ++ ++#ifndef GOMP_PTRLOCK_H ++#define GOMP_PTRLOCK_H 1 ++ ++typedef struct { void *ptr; gomp_mutex_t lock; } gomp_ptrlock_t; ++ ++static inline void gomp_ptrlock_init (gomp_ptrlock_t *ptrlock, void *ptr) ++{ ++ ptrlock->ptr = ptr; ++ gomp_mutex_init (&ptrlock->lock); ++} ++ ++static inline void *gomp_ptrlock_get (gomp_ptrlock_t *ptrlock) ++{ ++ if (ptrlock->ptr != NULL) ++ return ptrlock->ptr; ++ ++ gomp_mutex_lock (&ptrlock->lock); ++ if (ptrlock->ptr != NULL) ++ { ++ gomp_mutex_unlock (&ptrlock->lock); ++ return ptrlock->ptr; ++ } ++ ++ return NULL; ++} ++ ++static inline void gomp_ptrlock_set (gomp_ptrlock_t *ptrlock, void *ptr) ++{ ++ ptrlock->ptr = ptr; ++ gomp_mutex_unlock (&ptrlock->lock); ++} ++ ++static inline void gomp_ptrlock_destroy (gomp_ptrlock_t *ptrlock) ++{ ++ gomp_mutex_destroy (&ptrlock->lock); ++} ++ ++#endif /* GOMP_PTRLOCK_H */ +--- libgomp/config/posix/ptrlock.c.jj 2008-03-26 15:11:32.000000000 +0100 ++++ libgomp/config/posix/ptrlock.c 2008-03-26 15:11:32.000000000 +0100 +@@ -0,0 +1 @@ ++/* Everything is in the header. */ +--- libgomp/config/posix/bar.c.jj 2007-12-07 14:41:01.000000000 +0100 ++++ libgomp/config/posix/bar.c 2008-03-26 15:11:32.000000000 +0100 +@@ -1,4 +1,4 @@ +-/* Copyright (C) 2005 Free Software Foundation, Inc. ++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc. + Contributed by Richard Henderson <rth@redhat.com>. + + This file is part of the GNU OpenMP Library (libgomp). +@@ -70,7 +70,7 @@ gomp_barrier_reinit (gomp_barrier_t *bar + } + + void +-gomp_barrier_wait_end (gomp_barrier_t *bar, bool last) ++gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t last) + { + unsigned int n; + +--- libgomp/config/linux/alpha/futex.h.jj 2007-12-07 14:41:00.000000000 +0100 ++++ libgomp/config/linux/alpha/futex.h 2008-03-26 15:11:32.000000000 +0100 +@@ -1,4 +1,4 @@ +-/* Copyright (C) 2005 Free Software Foundation, Inc. ++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc. + Contributed by Richard Henderson <rth@redhat.com>. + + This file is part of the GNU OpenMP Library (libgomp). +@@ -30,8 +30,6 @@ + #ifndef SYS_futex + #define SYS_futex 394 + #endif +-#define FUTEX_WAIT 0 +-#define FUTEX_WAKE 1 + + + static inline void +@@ -45,7 +43,7 @@ futex_wait (int *addr, int val) + + sc_0 = SYS_futex; + sc_16 = (long) addr; +- sc_17 = FUTEX_WAIT; ++ sc_17 = gomp_futex_wait; + sc_18 = val; + sc_19 = 0; + __asm volatile ("callsys" +@@ -53,6 +51,20 @@ futex_wait (int *addr, int val) + : "0"(sc_0), "r" (sc_16), "r"(sc_17), "r"(sc_18), "1"(sc_19) + : "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$8", + "$22", "$23", "$24", "$25", "$27", "$28", "memory"); ++ if (__builtin_expect (sc_19, 0) && sc_0 == ENOSYS) ++ { ++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG; ++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG; ++ sc_0 = SYS_futex; ++ sc_17 &= ~FUTEX_PRIVATE_FLAG; ++ sc_19 = 0; ++ __asm volatile ("callsys" ++ : "=r" (sc_0), "=r"(sc_19) ++ : "0"(sc_0), "r" (sc_16), "r"(sc_17), "r"(sc_18), ++ "1"(sc_19) ++ : "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$8", ++ "$22", "$23", "$24", "$25", "$27", "$28", "memory"); ++ } + } + + static inline void +@@ -66,11 +78,35 @@ futex_wake (int *addr, int count) + + sc_0 = SYS_futex; + sc_16 = (long) addr; +- sc_17 = FUTEX_WAKE; ++ sc_17 = gomp_futex_wake; + sc_18 = count; + __asm volatile ("callsys" + : "=r" (sc_0), "=r"(sc_19) + : "0"(sc_0), "r" (sc_16), "r"(sc_17), "r"(sc_18) + : "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$8", + "$22", "$23", "$24", "$25", "$27", "$28", "memory"); ++ if (__builtin_expect (sc_19, 0) && sc_0 == ENOSYS) ++ { ++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG; ++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG; ++ sc_0 = SYS_futex; ++ sc_17 &= ~FUTEX_PRIVATE_FLAG; ++ __asm volatile ("callsys" ++ : "=r" (sc_0), "=r"(sc_19) ++ : "0"(sc_0), "r" (sc_16), "r"(sc_17), "r"(sc_18) ++ : "$1", "$2", "$3", "$4", "$5", "$6", "$7", "$8", ++ "$22", "$23", "$24", "$25", "$27", "$28", "memory"); ++ } ++} ++ ++static inline void ++cpu_relax (void) ++{ ++ __asm volatile ("" : : : "memory"); ++} ++ ++static inline void ++atomic_write_barrier (void) ++{ ++ __asm volatile ("wmb" : : : "memory"); + } +--- libgomp/config/linux/affinity.c.jj 2007-12-07 14:41:00.000000000 +0100 ++++ libgomp/config/linux/affinity.c 2008-03-26 15:11:32.000000000 +0100 +@@ -1,4 +1,4 @@ +-/* Copyright (C) 2006, 2007 Free Software Foundation, Inc. ++/* Copyright (C) 2006, 2007, 2008 Free Software Foundation, Inc. + Contributed by Jakub Jelinek <jakub@redhat.com>. + + This file is part of the GNU OpenMP Library (libgomp). +@@ -38,9 +38,6 @@ + #ifdef HAVE_PTHREAD_AFFINITY_NP + + static unsigned int affinity_counter; +-#ifndef HAVE_SYNC_BUILTINS +-static gomp_mutex_t affinity_lock; +-#endif + + void + gomp_init_affinity (void) +@@ -76,9 +73,6 @@ gomp_init_affinity (void) + CPU_SET (gomp_cpu_affinity[0], &cpuset); + pthread_setaffinity_np (pthread_self (), sizeof (cpuset), &cpuset); + affinity_counter = 1; +-#ifndef HAVE_SYNC_BUILTINS +- gomp_mutex_init (&affinity_lock); +-#endif + } + + void +@@ -87,13 +81,7 @@ gomp_init_thread_affinity (pthread_attr_ + unsigned int cpu; + cpu_set_t cpuset; + +-#ifdef HAVE_SYNC_BUILTINS + cpu = __sync_fetch_and_add (&affinity_counter, 1); +-#else +- gomp_mutex_lock (&affinity_lock); +- cpu = affinity_counter++; +- gomp_mutex_unlock (&affinity_lock); +-#endif + cpu %= gomp_cpu_affinity_len; + CPU_ZERO (&cpuset); + CPU_SET (gomp_cpu_affinity[cpu], &cpuset); +--- libgomp/config/linux/bar.h.jj 2007-12-07 14:41:00.000000000 +0100 ++++ libgomp/config/linux/bar.h 2008-03-26 15:11:32.000000000 +0100 +@@ -1,4 +1,4 @@ +-/* Copyright (C) 2005 Free Software Foundation, Inc. ++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc. + Contributed by Richard Henderson <rth@redhat.com>. + + This file is part of the GNU OpenMP Library (libgomp). +@@ -36,40 +36,49 @@ + + typedef struct + { +- gomp_mutex_t mutex; +- unsigned total; +- unsigned arrived; +- int generation; ++ /* Make sure total/generation is in a mostly read cacheline, while ++ awaited in a separate cacheline. */ ++ unsigned total __attribute__((aligned (64))); ++ unsigned generation; ++ unsigned awaited __attribute__((aligned (64))); + } gomp_barrier_t; ++typedef unsigned int gomp_barrier_state_t; + + static inline void gomp_barrier_init (gomp_barrier_t *bar, unsigned count) + { +- gomp_mutex_init (&bar->mutex); + bar->total = count; +- bar->arrived = 0; ++ bar->awaited = count; + bar->generation = 0; + } + + static inline void gomp_barrier_reinit (gomp_barrier_t *bar, unsigned count) + { +- gomp_mutex_lock (&bar->mutex); ++ __sync_fetch_and_add (&bar->awaited, count - bar->total); + bar->total = count; +- gomp_mutex_unlock (&bar->mutex); + } + + static inline void gomp_barrier_destroy (gomp_barrier_t *bar) + { +- /* Before destroying, make sure all threads have left the barrier. */ +- gomp_mutex_lock (&bar->mutex); + } + + extern void gomp_barrier_wait (gomp_barrier_t *); +-extern void gomp_barrier_wait_end (gomp_barrier_t *, bool); ++extern void gomp_barrier_wait_last (gomp_barrier_t *); ++extern void gomp_barrier_wait_end (gomp_barrier_t *, gomp_barrier_state_t); + +-static inline bool gomp_barrier_wait_start (gomp_barrier_t *bar) ++static inline gomp_barrier_state_t ++gomp_barrier_wait_start (gomp_barrier_t *bar) + { +- gomp_mutex_lock (&bar->mutex); +- return ++bar->arrived == bar->total; ++ unsigned int ret = bar->generation; ++ /* Do we need any barrier here or is __sync_add_and_fetch acting ++ as the needed LoadLoad barrier already? */ ++ ret += __sync_add_and_fetch (&bar->awaited, -1) == 0; ++ return ret; ++} ++ ++static inline bool ++gomp_barrier_last_thread (gomp_barrier_state_t state) ++{ ++ return state & 1; + } + + #endif /* GOMP_BARRIER_H */ +--- libgomp/config/linux/ptrlock.h.jj 2008-03-26 15:11:32.000000000 +0100 ++++ libgomp/config/linux/ptrlock.h 2008-03-26 15:11:32.000000000 +0100 +@@ -0,0 +1,65 @@ ++/* Copyright (C) 2008 Free Software Foundation, Inc. ++ Contributed by Jakub Jelinek <jakub@redhat.com>. ++ ++ This file is part of the GNU OpenMP Library (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU Lesser General Public License as published by ++ the Free Software Foundation; either version 2.1 of the License, or ++ (at your option) any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for ++ more details. ++ ++ You should have received a copy of the GNU Lesser General Public License ++ along with libgomp; see the file COPYING.LIB. If not, write to the ++ Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, ++ MA 02110-1301, USA. */ ++ ++/* As a special exception, if you link this library with other files, some ++ of which are compiled with GCC, to produce an executable, this library ++ does not by itself cause the resulting executable to be covered by the ++ GNU General Public License. This exception does not however invalidate ++ any other reasons why the executable file might be covered by the GNU ++ General Public License. */ ++ ++/* This is a Linux specific implementation of a mutex synchronization ++ mechanism for libgomp. This type is private to the library. This ++ implementation uses atomic instructions and the futex syscall. */ ++ ++#ifndef GOMP_PTRLOCK_H ++#define GOMP_PTRLOCK_H 1 ++ ++typedef void *gomp_ptrlock_t; ++ ++static inline void gomp_ptrlock_init (gomp_ptrlock_t *ptrlock, void *ptr) ++{ ++ *ptrlock = ptr; ++} ++ ++extern void *gomp_ptrlock_get_slow (gomp_ptrlock_t *ptrlock); ++static inline void *gomp_ptrlock_get (gomp_ptrlock_t *ptrlock) ++{ ++ if ((uintptr_t) *ptrlock > 2) ++ return *ptrlock; ++ ++ if (__sync_bool_compare_and_swap (ptrlock, NULL, (uintptr_t) 1)) ++ return NULL; ++ ++ return gomp_ptrlock_get_slow (ptrlock); ++} ++ ++extern void gomp_ptrlock_set_slow (gomp_ptrlock_t *ptrlock, void *ptr); ++static inline void gomp_ptrlock_set (gomp_ptrlock_t *ptrlock, void *ptr) ++{ ++ if (!__sync_bool_compare_and_swap (ptrlock, (uintptr_t) 1, ptr)) ++ gomp_ptrlock_set_slow (ptrlock, ptr); ++} ++ ++static inline void gomp_ptrlock_destroy (gomp_ptrlock_t *ptrlock) ++{ ++} ++ ++#endif /* GOMP_PTRLOCK_H */ +--- libgomp/config/linux/lock.c.jj 2007-12-07 14:41:00.000000000 +0100 ++++ libgomp/config/linux/lock.c 2008-03-26 15:11:32.000000000 +0100 +@@ -29,11 +29,10 @@ + primitives. This implementation uses atomic instructions and the futex + syscall. */ + +-#include "libgomp.h" + #include <string.h> + #include <unistd.h> + #include <sys/syscall.h> +-#include "futex.h" ++#include "wait.h" + + + /* The internal gomp_mutex_t and the external non-recursive omp_lock_t +@@ -137,7 +136,7 @@ omp_set_nest_lock (omp_nest_lock_t *lock + return; + } + +- futex_wait (&lock->owner, otid); ++ do_wait (&lock->owner, otid); + } + } + +--- libgomp/config/linux/ptrlock.c.jj 2008-03-26 15:11:32.000000000 +0100 ++++ libgomp/config/linux/ptrlock.c 2008-03-26 15:11:32.000000000 +0100 +@@ -0,0 +1,70 @@ ++/* Copyright (C) 2008 Free Software Foundation, Inc. ++ Contributed by Jakub Jelinek <jakub@redhat.com>. ++ ++ This file is part of the GNU OpenMP Library (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU Lesser General Public License as published by ++ the Free Software Foundation; either version 2.1 of the License, or ++ (at your option) any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for ++ more details. ++ ++ You should have received a copy of the GNU Lesser General Public License ++ along with libgomp; see the file COPYING.LIB. If not, write to the ++ Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, ++ MA 02110-1301, USA. */ ++ ++/* As a special exception, if you link this library with other files, some ++ of which are compiled with GCC, to produce an executable, this library ++ does not by itself cause the resulting executable to be covered by the ++ GNU General Public License. This exception does not however invalidate ++ any other reasons why the executable file might be covered by the GNU ++ General Public License. */ ++ ++/* This is a Linux specific implementation of a mutex synchronization ++ mechanism for libgomp. This type is private to the library. This ++ implementation uses atomic instructions and the futex syscall. */ ++ ++#include <endian.h> ++#include <limits.h> ++#include "wait.h" ++ ++void * ++gomp_ptrlock_get_slow (gomp_ptrlock_t *ptrlock) ++{ ++ int *intptr; ++ __sync_bool_compare_and_swap (ptrlock, 1, 2); ++ ++ /* futex works on ints, not pointers. ++ But a valid work share pointer will be at least ++ 8 byte aligned, so it is safe to assume the low ++ 32-bits of the pointer won't contain values 1 or 2. */ ++ __asm volatile ("" : "=r" (intptr) : "0" (ptrlock)); ++#if __BYTE_ORDER == __BIG_ENDIAN ++ if (sizeof (*ptrlock) > sizeof (int)) ++ intptr += (sizeof (*ptrlock) / sizeof (int)) - 1; ++#endif ++ do ++ do_wait (intptr, 2); ++ while (*intptr == 2); ++ __asm volatile ("" : : : "memory"); ++ return *ptrlock; ++} ++ ++void ++gomp_ptrlock_set_slow (gomp_ptrlock_t *ptrlock, void *ptr) ++{ ++ int *intptr; ++ ++ *ptrlock = ptr; ++ __asm volatile ("" : "=r" (intptr) : "0" (ptrlock)); ++#if __BYTE_ORDER == __BIG_ENDIAN ++ if (sizeof (*ptrlock) > sizeof (int)) ++ intptr += (sizeof (*ptrlock) / sizeof (int)) - 1; ++#endif ++ futex_wake (intptr, INT_MAX); ++} +--- libgomp/config/linux/x86/futex.h.jj 2007-12-07 14:41:00.000000000 +0100 ++++ libgomp/config/linux/x86/futex.h 2008-03-26 15:11:32.000000000 +0100 +@@ -1,4 +1,4 @@ +-/* Copyright (C) 2005 Free Software Foundation, Inc. ++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc. + Contributed by Richard Henderson <rth@redhat.com>. + + This file is part of the GNU OpenMP Library (libgomp). +@@ -27,9 +27,6 @@ + + /* Provide target-specific access to the futex system call. */ + +-#define FUTEX_WAIT 0 +-#define FUTEX_WAKE 1 +- + #ifdef __LP64__ + # ifndef SYS_futex + # define SYS_futex 202 +@@ -38,14 +35,26 @@ + static inline void + futex_wait (int *addr, int val) + { +- register long r10 __asm__("%r10") = 0; ++ register long r10 __asm__("%r10"); + long res; + ++ r10 = 0; + __asm volatile ("syscall" + : "=a" (res) +- : "0"(SYS_futex), "D" (addr), "S"(FUTEX_WAIT), +- "d"(val), "r"(r10) ++ : "0" (SYS_futex), "D" (addr), "S" (gomp_futex_wait), ++ "d" (val), "r" (r10) + : "r11", "rcx", "memory"); ++ if (__builtin_expect (res == -ENOSYS, 0)) ++ { ++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG; ++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG; ++ r10 = 0; ++ __asm volatile ("syscall" ++ : "=a" (res) ++ : "0" (SYS_futex), "D" (addr), "S" (gomp_futex_wait), ++ "d" (val), "r" (r10) ++ : "r11", "rcx", "memory"); ++ } + } + + static inline void +@@ -55,8 +64,19 @@ futex_wake (int *addr, int count) + + __asm volatile ("syscall" + : "=a" (res) +- : "0"(SYS_futex), "D" (addr), "S"(FUTEX_WAKE), "d"(count) ++ : "0" (SYS_futex), "D" (addr), "S" (gomp_futex_wake), ++ "d" (count) + : "r11", "rcx", "memory"); ++ if (__builtin_expect (res == -ENOSYS, 0)) ++ { ++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG; ++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG; ++ __asm volatile ("syscall" ++ : "=a" (res) ++ : "0" (SYS_futex), "D" (addr), "S" (gomp_futex_wake), ++ "d" (count) ++ : "r11", "rcx", "memory"); ++ } + } + #else + # ifndef SYS_futex +@@ -65,7 +85,7 @@ futex_wake (int *addr, int count) + + # ifdef __PIC__ + +-static inline void ++static inline long + sys_futex0 (int *addr, int op, int val) + { + long res; +@@ -77,11 +97,12 @@ sys_futex0 (int *addr, int op, int val) + : "0"(SYS_futex), "r" (addr), "c"(op), + "d"(val), "S"(0) + : "memory"); ++ return res; + } + + # else + +-static inline void ++static inline long + sys_futex0 (int *addr, int op, int val) + { + long res; +@@ -91,6 +112,7 @@ sys_futex0 (int *addr, int op, int val) + : "0"(SYS_futex), "b" (addr), "c"(op), + "d"(val), "S"(0) + : "memory"); ++ return res; + } + + # endif /* __PIC__ */ +@@ -98,13 +120,37 @@ sys_futex0 (int *addr, int op, int val) + static inline void + futex_wait (int *addr, int val) + { +- sys_futex0 (addr, FUTEX_WAIT, val); ++ long res = sys_futex0 (addr, gomp_futex_wait, val); ++ if (__builtin_expect (res == -ENOSYS, 0)) ++ { ++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG; ++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG; ++ sys_futex0 (addr, gomp_futex_wait, val); ++ } + } + + static inline void + futex_wake (int *addr, int count) + { +- sys_futex0 (addr, FUTEX_WAKE, count); ++ long res = sys_futex0 (addr, gomp_futex_wake, count); ++ if (__builtin_expect (res == -ENOSYS, 0)) ++ { ++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG; ++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG; ++ sys_futex0 (addr, gomp_futex_wake, count); ++ } + } + + #endif /* __LP64__ */ ++ ++static inline void ++cpu_relax (void) ++{ ++ __asm volatile ("rep; nop" : : : "memory"); ++} ++ ++static inline void ++atomic_write_barrier (void) ++{ ++ __sync_synchronize (); ++} +--- libgomp/config/linux/wait.h.jj 2008-03-26 15:11:32.000000000 +0100 ++++ libgomp/config/linux/wait.h 2008-03-26 15:11:32.000000000 +0100 +@@ -0,0 +1,68 @@ ++/* Copyright (C) 2008 Free Software Foundation, Inc. ++ Contributed by Jakub Jelinek <jakub@redhat.com>. ++ ++ This file is part of the GNU OpenMP Library (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU Lesser General Public License as published by ++ the Free Software Foundation; either version 2.1 of the License, or ++ (at your option) any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for ++ more details. ++ ++ You should have received a copy of the GNU Lesser General Public License ++ along with libgomp; see the file COPYING.LIB. If not, write to the ++ Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, ++ MA 02110-1301, USA. */ ++ ++/* As a special exception, if you link this library with other files, some ++ of which are compiled with GCC, to produce an executable, this library ++ does not by itself cause the resulting executable to be covered by the ++ GNU General Public License. This exception does not however invalidate ++ any other reasons why the executable file might be covered by the GNU ++ General Public License. */ ++ ++/* This is a Linux specific implementation of a mutex synchronization ++ mechanism for libgomp. This type is private to the library. This ++ implementation uses atomic instructions and the futex syscall. */ ++ ++#ifndef GOMP_WAIT_H ++#define GOMP_WAIT_H 1 ++ ++#include "libgomp.h" ++#include <errno.h> ++ ++#define FUTEX_WAIT 0 ++#define FUTEX_WAKE 1 ++#define FUTEX_PRIVATE_FLAG 128L ++ ++#ifdef HAVE_ATTRIBUTE_VISIBILITY ++# pragma GCC visibility push(hidden) ++#endif ++ ++extern long int gomp_futex_wait, gomp_futex_wake; ++ ++#include "futex.h" ++ ++static inline void do_wait (int *addr, int val) ++{ ++ unsigned long long i, count = gomp_spin_count_var; ++ ++ if (__builtin_expect (gomp_managed_threads > gomp_available_cpus, 0)) ++ count = gomp_throttled_spin_count_var; ++ for (i = 0; i < count; i++) ++ if (__builtin_expect (*addr != val, 0)) ++ return; ++ else ++ cpu_relax (); ++ futex_wait (addr, val); ++} ++ ++#ifdef HAVE_ATTRIBUTE_VISIBILITY ++# pragma GCC visibility pop ++#endif ++ ++#endif /* GOMP_WAIT_H */ +--- libgomp/config/linux/sparc/futex.h.jj 2007-12-07 14:41:00.000000000 +0100 ++++ libgomp/config/linux/sparc/futex.h 2008-03-26 15:11:32.000000000 +0100 +@@ -1,4 +1,4 @@ +-/* Copyright (C) 2005 Free Software Foundation, Inc. ++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc. + Contributed by Jakub Jelinek <jakub@redhat.com>. + + This file is part of the GNU OpenMP Library (libgomp). +@@ -28,10 +28,8 @@ + /* Provide target-specific access to the futex system call. */ + + #include <sys/syscall.h> +-#define FUTEX_WAIT 0 +-#define FUTEX_WAKE 1 + +-static inline void ++static inline long + sys_futex0 (int *addr, int op, int val) + { + register long int g1 __asm__ ("g1"); +@@ -47,9 +45,9 @@ sys_futex0 (int *addr, int op, int val) + o3 = 0; + + #ifdef __arch64__ +-# define SYSCALL_STRING "ta\t0x6d" ++# define SYSCALL_STRING "ta\t0x6d; bcs,a,pt %%xcc, 1f; sub %%g0, %%o0, %%o0; 1:" + #else +-# define SYSCALL_STRING "ta\t0x10" ++# define SYSCALL_STRING "ta\t0x10; bcs,a 1f; sub %%g0, %%o0, %%o0; 1:" + #endif + + __asm volatile (SYSCALL_STRING +@@ -65,16 +63,49 @@ sys_futex0 (int *addr, int op, int val) + "f48", "f50", "f52", "f54", "f56", "f58", "f60", "f62", + #endif + "cc", "memory"); ++ return o0; + } + + static inline void + futex_wait (int *addr, int val) + { +- sys_futex0 (addr, FUTEX_WAIT, val); ++ long err = sys_futex0 (addr, gomp_futex_wait, val); ++ if (__builtin_expect (err == ENOSYS, 0)) ++ { ++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG; ++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG; ++ sys_futex0 (addr, gomp_futex_wait, val); ++ } + } + + static inline void + futex_wake (int *addr, int count) + { +- sys_futex0 (addr, FUTEX_WAKE, count); ++ long err = sys_futex0 (addr, gomp_futex_wake, count); ++ if (__builtin_expect (err == ENOSYS, 0)) ++ { ++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG; ++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG; ++ sys_futex0 (addr, gomp_futex_wake, count); ++ } ++} ++ ++static inline void ++cpu_relax (void) ++{ ++#if defined __arch64__ || defined __sparc_v9__ ++ __asm volatile ("membar #LoadLoad" : : : "memory"); ++#else ++ __asm volatile ("" : : : "memory"); ++#endif ++} ++ ++static inline void ++atomic_write_barrier (void) ++{ ++#if defined __arch64__ || defined __sparc_v9__ ++ __asm volatile ("membar #StoreStore" : : : "memory"); ++#else ++ __sync_synchronize (); ++#endif + } +--- libgomp/config/linux/ia64/futex.h.jj 2007-12-07 14:41:00.000000000 +0100 ++++ libgomp/config/linux/ia64/futex.h 2008-03-26 15:11:32.000000000 +0100 +@@ -1,4 +1,4 @@ +-/* Copyright (C) 2005 Free Software Foundation, Inc. ++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc. + Contributed by Richard Henderson <rth@redhat.com>. + + This file is part of the GNU OpenMP Library (libgomp). +@@ -29,23 +29,24 @@ + + #include <sys/syscall.h> + +-#define FUTEX_WAIT 0 +-#define FUTEX_WAKE 1 + + +-static inline void +-sys_futex0(int *addr, int op, int val) ++static inline long ++sys_futex0(int *addr, long op, int val) + { + register long out0 asm ("out0") = (long) addr; + register long out1 asm ("out1") = op; + register long out2 asm ("out2") = val; + register long out3 asm ("out3") = 0; ++ register long r8 asm ("r8"); ++ register long r10 asm ("r10"); + register long r15 asm ("r15") = SYS_futex; + + __asm __volatile ("break 0x100000" +- : "=r"(r15), "=r"(out0), "=r"(out1), "=r"(out2), "=r"(out3) ++ : "=r"(r15), "=r"(out0), "=r"(out1), "=r"(out2), "=r"(out3), ++ "=r"(r8), "=r"(r10) + : "r"(r15), "r"(out0), "r"(out1), "r"(out2), "r"(out3) +- : "memory", "r8", "r10", "out4", "out5", "out6", "out7", ++ : "memory", "out4", "out5", "out6", "out7", + /* Non-stacked integer registers, minus r8, r10, r15. */ + "r2", "r3", "r9", "r11", "r12", "r13", "r14", "r16", "r17", "r18", + "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r26", "r27", +@@ -56,16 +57,41 @@ sys_futex0(int *addr, int op, int val) + "f6", "f7", "f8", "f9", "f10", "f11", "f12", "f13", "f14", "f15", + /* Branch registers. */ + "b6"); ++ return r8 & r10; + } + + static inline void + futex_wait (int *addr, int val) + { +- sys_futex0 (addr, FUTEX_WAIT, val); ++ long err = sys_futex0 (addr, gomp_futex_wait, val); ++ if (__builtin_expect (err == ENOSYS, 0)) ++ { ++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG; ++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG; ++ sys_futex0 (addr, gomp_futex_wait, val); ++ } + } + + static inline void + futex_wake (int *addr, int count) + { +- sys_futex0 (addr, FUTEX_WAKE, count); ++ long err = sys_futex0 (addr, gomp_futex_wake, count); ++ if (__builtin_expect (err == ENOSYS, 0)) ++ { ++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG; ++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG; ++ sys_futex0 (addr, gomp_futex_wake, count); ++ } ++} ++ ++static inline void ++cpu_relax (void) ++{ ++ __asm volatile ("hint @pause" : : : "memory"); ++} ++ ++static inline void ++atomic_write_barrier (void) ++{ ++ __sync_synchronize (); + } +--- libgomp/config/linux/s390/futex.h.jj 2007-12-07 14:41:00.000000000 +0100 ++++ libgomp/config/linux/s390/futex.h 2008-03-26 15:11:32.000000000 +0100 +@@ -1,4 +1,4 @@ +-/* Copyright (C) 2005 Free Software Foundation, Inc. ++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc. + Contributed by Jakub Jelinek <jakub@redhat.com>. + + This file is part of the GNU OpenMP Library (libgomp). +@@ -28,10 +28,8 @@ + /* Provide target-specific access to the futex system call. */ + + #include <sys/syscall.h> +-#define FUTEX_WAIT 0 +-#define FUTEX_WAKE 1 + +-static inline void ++static inline long + sys_futex0 (int *addr, int op, int val) + { + register long int gpr2 __asm__ ("2"); +@@ -49,16 +47,41 @@ sys_futex0 (int *addr, int op, int val) + : "i" (SYS_futex), + "0" (gpr2), "d" (gpr3), "d" (gpr4), "d" (gpr5) + : "memory"); ++ return gpr2; + } + + static inline void + futex_wait (int *addr, int val) + { +- sys_futex0 (addr, FUTEX_WAIT, val); ++ long err = sys_futex0 (addr, gomp_futex_wait, val); ++ if (__builtin_expect (err == -ENOSYS, 0)) ++ { ++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG; ++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG; ++ sys_futex0 (addr, gomp_futex_wait, val); ++ } + } + + static inline void + futex_wake (int *addr, int count) + { +- sys_futex0 (addr, FUTEX_WAKE, count); ++ long err = sys_futex0 (addr, gomp_futex_wake, count); ++ if (__builtin_expect (err == -ENOSYS, 0)) ++ { ++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG; ++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG; ++ sys_futex0 (addr, gomp_futex_wake, count); ++ } ++} ++ ++static inline void ++cpu_relax (void) ++{ ++ __asm volatile ("" : : : "memory"); ++} ++ ++static inline void ++atomic_write_barrier (void) ++{ ++ __sync_synchronize (); + } +--- libgomp/config/linux/mutex.c.jj 2007-12-07 14:41:00.000000000 +0100 ++++ libgomp/config/linux/mutex.c 2008-03-26 15:11:32.000000000 +0100 +@@ -1,4 +1,4 @@ +-/* Copyright (C) 2005 Free Software Foundation, Inc. ++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc. + Contributed by Richard Henderson <rth@redhat.com>. + + This file is part of the GNU OpenMP Library (libgomp). +@@ -29,9 +29,10 @@ + mechanism for libgomp. This type is private to the library. This + implementation uses atomic instructions and the futex syscall. */ + +-#include "libgomp.h" +-#include "futex.h" ++#include "wait.h" + ++long int gomp_futex_wake = FUTEX_WAKE | FUTEX_PRIVATE_FLAG; ++long int gomp_futex_wait = FUTEX_WAIT | FUTEX_PRIVATE_FLAG; + + void + gomp_mutex_lock_slow (gomp_mutex_t *mutex) +@@ -40,7 +41,7 @@ gomp_mutex_lock_slow (gomp_mutex_t *mute + { + int oldval = __sync_val_compare_and_swap (mutex, 1, 2); + if (oldval != 0) +- futex_wait (mutex, 2); ++ do_wait (mutex, 2); + } + while (!__sync_bool_compare_and_swap (mutex, 0, 2)); + } +--- libgomp/config/linux/sem.c.jj 2007-12-07 14:41:00.000000000 +0100 ++++ libgomp/config/linux/sem.c 2008-03-26 15:11:32.000000000 +0100 +@@ -1,4 +1,4 @@ +-/* Copyright (C) 2005 Free Software Foundation, Inc. ++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc. + Contributed by Richard Henderson <rth@redhat.com>. + + This file is part of the GNU OpenMP Library (libgomp). +@@ -29,8 +29,7 @@ + mechanism for libgomp. This type is private to the library. This + implementation uses atomic instructions and the futex syscall. */ + +-#include "libgomp.h" +-#include "futex.h" ++#include "wait.h" + + + void +@@ -44,7 +43,7 @@ gomp_sem_wait_slow (gomp_sem_t *sem) + if (__sync_bool_compare_and_swap (sem, val, val - 1)) + return; + } +- futex_wait (sem, -1); ++ do_wait (sem, -1); + } + } + +--- libgomp/config/linux/powerpc/futex.h.jj 2007-12-07 14:41:00.000000000 +0100 ++++ libgomp/config/linux/powerpc/futex.h 2008-03-26 15:11:32.000000000 +0100 +@@ -1,4 +1,4 @@ +-/* Copyright (C) 2005 Free Software Foundation, Inc. ++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc. + Contributed by Richard Henderson <rth@redhat.com>. + + This file is part of the GNU OpenMP Library (libgomp). +@@ -28,10 +28,8 @@ + /* Provide target-specific access to the futex system call. */ + + #include <sys/syscall.h> +-#define FUTEX_WAIT 0 +-#define FUTEX_WAKE 1 + +-static inline void ++static inline long + sys_futex0 (int *addr, int op, int val) + { + register long int r0 __asm__ ("r0"); +@@ -50,21 +48,48 @@ sys_futex0 (int *addr, int op, int val) + doesn't. It doesn't much matter for us. In the interest of unity, + go ahead and clobber it always. */ + +- __asm volatile ("sc" ++ __asm volatile ("sc; mfcr %0" + : "=r"(r0), "=r"(r3), "=r"(r4), "=r"(r5), "=r"(r6) + : "r"(r0), "r"(r3), "r"(r4), "r"(r5), "r"(r6) + : "r7", "r8", "r9", "r10", "r11", "r12", + "cr0", "ctr", "memory"); ++ if (__builtin_expect (r0 & (1 << 28), 0)) ++ return r3; ++ return 0; + } + + static inline void + futex_wait (int *addr, int val) + { +- sys_futex0 (addr, FUTEX_WAIT, val); ++ long err = sys_futex0 (addr, gomp_futex_wait, val); ++ if (__builtin_expect (err == ENOSYS, 0)) ++ { ++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG; ++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG; ++ sys_futex0 (addr, gomp_futex_wait, val); ++ } + } + + static inline void + futex_wake (int *addr, int count) + { +- sys_futex0 (addr, FUTEX_WAKE, count); ++ long err = sys_futex0 (addr, gomp_futex_wake, count); ++ if (__builtin_expect (err == ENOSYS, 0)) ++ { ++ gomp_futex_wait &= ~FUTEX_PRIVATE_FLAG; ++ gomp_futex_wake &= ~FUTEX_PRIVATE_FLAG; ++ sys_futex0 (addr, gomp_futex_wake, count); ++ } ++} ++ ++static inline void ++cpu_relax (void) ++{ ++ __asm volatile ("" : : : "memory"); ++} ++ ++static inline void ++atomic_write_barrier (void) ++{ ++ __asm volatile ("eieio" : : : "memory"); + } +--- libgomp/config/linux/bar.c.jj 2007-12-07 14:41:00.000000000 +0100 ++++ libgomp/config/linux/bar.c 2008-03-26 15:11:32.000000000 +0100 +@@ -1,4 +1,4 @@ +-/* Copyright (C) 2005 Free Software Foundation, Inc. ++/* Copyright (C) 2005, 2008 Free Software Foundation, Inc. + Contributed by Richard Henderson <rth@redhat.com>. + + This file is part of the GNU OpenMP Library (libgomp). +@@ -29,32 +29,29 @@ + mechanism for libgomp. This type is private to the library. This + implementation uses atomic instructions and the futex syscall. */ + +-#include "libgomp.h" +-#include "futex.h" + #include <limits.h> ++#include "wait.h" + + + void +-gomp_barrier_wait_end (gomp_barrier_t *bar, bool last) ++gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state) + { +- if (last) ++ if (__builtin_expect ((state & 1) != 0, 0)) + { +- bar->generation++; +- futex_wake (&bar->generation, INT_MAX); ++ /* Next time we'll be awaiting TOTAL threads again. */ ++ bar->awaited = bar->total; ++ atomic_write_barrier (); ++ bar->generation += 2; ++ futex_wake ((int *) &bar->generation, INT_MAX); + } + else + { +- unsigned int generation = bar->generation; +- +- gomp_mutex_unlock (&bar->mutex); ++ unsigned int generation = state; + + do +- futex_wait (&bar->generation, generation); ++ do_wait ((int *) &bar->generation, generation); + while (bar->generation == generation); + } +- +- if (__sync_add_and_fetch (&bar->arrived, -1) == 0) +- gomp_mutex_unlock (&bar->mutex); + } + + void +@@ -62,3 +59,18 @@ gomp_barrier_wait (gomp_barrier_t *barri + { + gomp_barrier_wait_end (barrier, gomp_barrier_wait_start (barrier)); + } ++ ++/* Like gomp_barrier_wait, except that if the encountering thread ++ is not the last one to hit the barrier, it returns immediately. ++ The intended usage is that a thread which intends to gomp_barrier_destroy ++ this barrier calls gomp_barrier_wait, while all other threads ++ call gomp_barrier_wait_last. When gomp_barrier_wait returns, ++ the barrier can be safely destroyed. */ ++ ++void ++gomp_barrier_wait_last (gomp_barrier_t *barrier) ++{ ++ gomp_barrier_state_t state = gomp_barrier_wait_start (barrier); ++ if (state & 1) ++ gomp_barrier_wait_end (barrier, state); ++} |